Compare commits

..

1 Commits

Author SHA1 Message Date
Konstantin Knizhnik
779508caa9 Perform wal-redo by direct call of Postgres redo functions 2022-11-07 11:09:44 +02:00
126 changed files with 1946 additions and 5504 deletions

View File

@@ -55,22 +55,6 @@ runs:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon
- name: Download Neon binaries for the previous release
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact
path: /tmp/neon-previous
prefix: latest
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3
@@ -89,18 +73,23 @@ runs:
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Download compatibility snapshot for Postgres 14
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14
path: /tmp/compatibility_snapshot_pg14
prefix: latest
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
COMPATIBILITY_NEON_BIN: /tmp/neon-previous/bin
COMPATIBILITY_POSTGRES_DISTRIB_DIR: /tmp/neon-previous/pg_install
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes')
shell: bash -euxo pipefail {0}
run: |
# PLATFORM will be embedded in the perf test report
@@ -123,12 +112,7 @@ runs:
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
# -n4 uses four processes to run tests via pytest-xdist
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
# --dist=loadgroup points tests marked with @pytest.mark.xdist_group
# to the same worker to make @pytest.mark.order work with xdist
EXTRA_PARAMS="--dist=loadgroup $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
@@ -163,9 +147,9 @@ runs:
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is deliberately not passed: it would disable pytest's output capturing, and because
# tests are running in parallel the logs of different tests would get mixed together
#
mkdir -p $TEST_OUTPUT/allure/results
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
@@ -185,12 +169,12 @@ runs:
uses: ./.github/actions/upload
with:
name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
# The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
# The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/
prefix: latest
- name: Create Allure report
if: success() || failure()
if: always()
uses: ./.github/actions/allure-report
with:
action: store

View File

@@ -1,3 +1,5 @@
zenith_install.tar.gz
.zenith_current_version
neon_install.tar.gz
.neon_current_version

View File

@@ -22,10 +22,6 @@ storage:
console_region_id: aws-us-west-2
zenith-1-ps-3:
console_region_id: aws-us-west-2
zenith-1-ps-4:
console_region_id: aws-us-west-2
zenith-1-ps-5:
console_region_id: aws-us-west-2
safekeepers:
hosts:

View File

@@ -1,33 +0,0 @@
storage:
vars:
bucket_name: neon-dev-storage-eu-west-1
bucket_region: eu-west-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.eu-west-1.aws.neon.build:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:
bucket_name: "{{ bucket_name }}"
bucket_region: "{{ bucket_region }}"
prefix_in_bucket: "pageserver/v1"
safekeeper_s3_prefix: safekeeper/v1/wal
hostname_suffix: ""
remote_user: ssm-user
ansible_aws_ssm_region: eu-west-1
ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1
console_region_id: aws-eu-west-1
children:
pageservers:
hosts:
pageserver-0.eu-west-1.aws.neon.build:
ansible_host: i-01d496c5041c7f34c
safekeepers:
hosts:
safekeeper-0.eu-west-1.aws.neon.build:
ansible_host: i-05226ef85722831bf
safekeeper-1.eu-west-1.aws.neon.build:
ansible_host: i-06969ee1bf2958bfc
safekeeper-2.eu-west-1.aws.neon.build:
ansible_host: i-087892e9625984a0b

View File

@@ -3,7 +3,7 @@ storage:
bucket_name: zenith-staging-storage-us-east-1
bucket_region: us-east-1
console_mgmt_base_url: http://console-staging.local
etcd_endpoints: etcd-0.us-east-2.aws.neon.build:2379
etcd_endpoints: zenith-us-stage-etcd.local:2379
pageserver_config_stub:
pg_distrib_dir: /usr/local
remote_storage:

View File

@@ -22,8 +22,6 @@ storage:
hosts:
pageserver-0.us-east-2.aws.neon.build:
ansible_host: i-0c3e70929edb5d691
pageserver-1.us-east-2.aws.neon.build:
ansible_host: i-0565a8b4008aa3f40
safekeepers:
hosts:

View File

@@ -1,31 +0,0 @@
# Helm chart values for neon-proxy-scram.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.eu-west-1.aws.neon.build"
# -- Additional labels for neon-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: dev
zenith_region: eu-west-1
zenith_region_slug: eu-west-1
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: eu-west-1.aws.neon.build
#metrics:
# enabled: true
# serviceMonitor:
# enabled: true
# selector:
# release: kube-prometheus-stack

View File

@@ -144,9 +144,7 @@ jobs:
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
# rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-postgres ]
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch ]
db_size: [ 10gb ]
include:
- platform: neon-captest-new
@@ -166,7 +164,7 @@ jobs:
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}
PLATFORM: ${{ matrix.platform }}
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
options: --init
@@ -209,11 +207,8 @@ jobs:
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
;;
rds-postgres)
CONNSTR=${{ secrets.BENCHMARK_RDS_POSTGRES_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch', 'rds-aurora', or 'rds-postgres'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
exit 1
;;
esac
@@ -270,7 +265,7 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
if: success() || failure()
if: always()
uses: ./.github/actions/allure-report
with:
action: generate

View File

@@ -18,8 +18,8 @@ env:
jobs:
tag:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
@@ -46,7 +46,7 @@ jobs:
id: build-tag
build-neon:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -236,7 +236,7 @@ jobs:
uses: ./.github/actions/save-coverage-data
regress-tests:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -268,8 +268,34 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
upload-latest-artifacts:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
if: github.ref_name == 'main'
steps:
- name: Copy Neon artifact to the latest directory
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/${{ github.run_id }}
run: |
for build_type in debug release; do
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
done
benchmarks:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -300,12 +326,12 @@ jobs:
# while coverage is currently collected for the debug ones
merge-allure-report:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: success() || failure()
if: always()
strategy:
fail-fast: false
matrix:
@@ -338,7 +364,7 @@ jobs:
DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
coverage-report:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -415,19 +441,15 @@ jobs:
shell: bash -euxo pipefail {0}
trigger-e2e-tests:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
needs: [ push-docker-hub, tag ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
# For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
# but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
@@ -453,14 +475,12 @@ jobs:
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${{ needs.tag.outputs.build-tag }}\",
\"compute_image_tag\": \"${{ needs.tag.outputs.build-tag }}\"
\"remote_repo\": \"${{ github.repository }}\"
}
}"
neon-image:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
@@ -478,7 +498,7 @@ jobs:
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
compute-tools-image:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
needs: [ tag ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
@@ -492,8 +512,28 @@ jobs:
- name: Kaniko build compute tools
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}}
compute-node-image:
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
- name: Kaniko build compute node with extensions v14 (compatibility)
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}}
compute-node-image-v14:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
@@ -509,8 +549,9 @@ jobs:
- name: Kaniko build compute node with extensions v14
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v14 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}}
compute-node-image-v15:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: gcr.io/kaniko-project/executor:v1.9.0-debug
needs: [ tag ]
steps:
@@ -526,58 +567,18 @@ jobs:
- name: Kaniko build compute node with extensions v15
run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-v15 --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}}
test-images:
needs: [ tag, neon-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
runs-on: [ self-hosted, dev, x64 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
# `neondatabase/neon` contains multiple binaries; all of them pass the same version input to the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
# Neon page server git-env:32d14403bd6ab4f4520a94cbfd81a6acef7a526c failpoints: true, features: []
# Bad versions might look like:
# Neon page server git-env:local failpoints: true, features: ["testing"]
# Ensure that we don't have bad versions.
- name: Verify image versions
shell: bash # ensure no set -e for better error messages
run: |
pageserver_version=$(docker run --rm 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} "/bin/sh" "-c" "/usr/local/bin/pageserver --version")
echo "Pageserver version string: $pageserver_version"
if ! echo "$pageserver_version" | grep -qv 'git-env:local' ; then
echo "Pageserver version should not be the default Dockerfile one"
exit 1
fi
if ! echo "$pageserver_version" | grep -qv '"testing"' ; then
echo "Pageserver version should have no testing feature enabled"
exit 1
fi
- name: Verify docker-compose example
run: env REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh
- name: Print logs and clean up
if: always()
run: |
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
promote-images:
runs-on: [ self-hosted, dev, x64 ]
needs: [ tag, test-images ]
runs-on: dev
needs: [ tag, neon-image, compute-node-image, compute-node-image-v14, compute-node-image-v15, compute-tools-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
name: [ neon, compute-node-v14, compute-node-v15, compute-tools ]
# compute-node uses postgres 14, which is default now
# cloud repo depends on this image name, thus duplicating it
# remove compute-node when cloud repo is updated
name: [ neon, compute-node, compute-node-v14, compute-node-v15, compute-tools ]
steps:
- name: Promote image to latest
@@ -586,7 +587,7 @@ jobs:
aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
needs: [ promote-images, tag ]
container: golang:1.19-bullseye
@@ -607,6 +608,9 @@ jobs:
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} compute-tools
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} compute-node
- name: Pull compute node v14 image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} compute-node-v14
@@ -623,6 +627,7 @@ jobs:
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} 093970136003.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:latest
@@ -638,6 +643,9 @@ jobs:
- name: Push compute tools image to Docker Hub
run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
- name: Push compute node image to Docker Hub
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
- name: Push compute node v14 image to Docker Hub
run: crane push compute-node-v14 neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}}
@@ -654,6 +662,7 @@ jobs:
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v14:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest
@@ -736,7 +745,7 @@ jobs:
rm -f neon_install.tar.gz .neon_current_version
deploy-new:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
@@ -761,6 +770,7 @@ jobs:
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
@@ -769,38 +779,6 @@ jobs:
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
rm -f neon_install.tar.gz .neon_current_version
deploy-pr-test-new:
runs-on: [ self-hosted, dev, x64 ]
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
if: |
contains(github.event.pull_request.labels.*.name, 'deploy-test-storage') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
target_region: [ eu-west-1 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
./get_binaries.sh
ansible-galaxy collection install sivel.toiletwater
ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{secrets.NEON_STAGING_API_KEY}}
@@ -811,7 +789,7 @@ jobs:
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, tag, regress-tests ]
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
@@ -847,7 +825,7 @@ jobs:
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
@@ -889,10 +867,10 @@ jobs:
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
deploy-proxy-new:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main') &&
github.event_name != 'workflow_dispatch'
@@ -904,8 +882,6 @@ jobs:
include:
- target_region: us-east-2
target_cluster: dev-us-east-2-beta
- target_region: eu-west-1
target_cluster: dev-eu-west-1-zeta
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -927,7 +903,7 @@ jobs:
runs-on: prod
container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, tag, regress-tests ]
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
@@ -960,8 +936,8 @@ jobs:
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
promote-compatibility-data:
runs-on: [ self-hosted, dev, x64 ]
promote-compatibility-test-snapshot:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
@@ -974,24 +950,9 @@ jobs:
BUCKET: neon-github-public-dev
PREFIX: artifacts/latest
run: |
# Update compatibility snapshot for the release
for build_type in debug release; do
OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst
NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst
time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME}
done
# Update Neon artifact for the release (reuse already uploaded artifact)
for build_type in debug release; do
OLD_PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME}
done

View File

@@ -115,7 +115,7 @@ jobs:
run: cargo build --locked --all --all-targets
check-rust-dependencies:
runs-on: [ self-hosted, dev, x64 ]
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

2
.gitmodules vendored
View File

@@ -1,7 +1,7 @@
[submodule "vendor/postgres-v14"]
path = vendor/postgres-v14
url = https://github.com/neondatabase/postgres.git
branch = REL_14_STABLE_neon
branch = main
[submodule "vendor/postgres-v15"]
path = vendor/postgres-v15
url = https://github.com/neondatabase/postgres.git

View File

@@ -1,11 +0,0 @@
/compute_tools/ @neondatabase/control-plane
/control_plane/ @neondatabase/compute @neondatabase/storage
/libs/pageserver_api/ @neondatabase/compute @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/pageserver/ @neondatabase/compute @neondatabase/storage
/pgxn/ @neondatabase/compute
/proxy/ @neondatabase/control-plane
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

8
Cargo.lock generated
View File

@@ -2255,14 +2255,6 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
[[package]]
name = "persistent_range_query"
version = "0.1.0"
dependencies = [
"rand",
"workspace_hack",
]
[[package]]
name = "petgraph"
version = "0.6.2"

View File

@@ -25,10 +25,6 @@ members = [
# Besides, debug info should not affect the performance.
debug = true
# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]
debug = false
[profile.release-line-debug]
inherits = "release"
debug = 1 # true = 2 = all symbols, 1 = line only

View File

@@ -0,0 +1,88 @@
#
# Legacy version of the Dockerfile for the compute node.
# Used by e2e CI. Building Dockerfile.compute-node would take an
# unreasonable amount of time without v2 runners.
#
# TODO: remove once cloud repo CI is moved to v2 runners.
#
# Stages, in order:
#   compute-deps - pre-built compute-tools image, source of the compute_ctl binary
#   build-deps   - Debian image with the packages needed to build Postgres
#   pg-build     - builds Postgres v14, its contribs and the neon extension
#   (final)      - slim runtime image that runs compute_ctl as user postgres
#
# Allow specifying a different compute-tools tag and image repo, so that we are
# able to use different images
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=compute-tools
ARG TAG=latest
#
# Image with pre-built tools
#
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
# This stage exists only to provide the ready-made compute_ctl binary as a
# dependency; it is copied into the final image below.
#
# Image with Postgres build deps
#
FROM debian:bullseye-slim AS build-deps
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
#
# Image with built Postgres
#
FROM build-deps AS pg-build
# Add user postgres
RUN adduser postgres
RUN mkdir /pg && chown postgres:postgres /pg
# Copy source files
# version 14 is default for now
# Both the Postgres sources and the contents of ./pgxn are copied into /pg,
# so the neon extension sources end up at /pg/neon (used by the last build step).
COPY ./vendor/postgres-v14 /pg/
COPY ./pgxn /pg/
# Build and install Postgres locally
# The build is done out-of-tree in /pg/compute_build and installed into
# /pg/compute_build/postgres_bin; --with-uuid=ossp pairs with libossp-uuid-dev above.
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
# Install main binaries and contribs
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
# Install neon contrib
# PG_CONFIG points the extension build at the Postgres installed just above.
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
USER postgres
WORKDIR /pg
#
# Final compute node image to be exported
#
FROM debian:bullseye-slim
# libreadline-dev is required to run psql
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
# Add user postgres
# Creates the postgres user with home /var/db/postgres plus the compute/ and
# specs/ directories under it; compute/ is restricted to mode 0750.
# NOTE(review): the password is hard-coded in the image ("test_console_pass");
# presumably acceptable only because this image is for e2e CI - confirm.
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute
# Copy ready Postgres binaries
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
# Copy binaries from compute-tools
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
# XXX: temporary symlink for compatibility with old control-plane
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
# Add postgres shared objects to the search path
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -10,7 +10,7 @@ POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
BUILD_TYPE ?= debug
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CFLAGS = -O2 -g3 $(CFLAGS)
PG_CFLAGS = -fPIC -O2 -g3 $(CFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
@@ -20,18 +20,18 @@ else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
# Seccomp BPF is only available for Linux
PG_CONFIGURE_OPTS += --with-libseccomp
else ifeq ($(UNAME_S),Darwin)
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
endif
# Use -C option so that when PostgreSQL "make install" installs the
@@ -73,8 +73,7 @@ $(POSTGRES_INSTALL_DIR)/build/v14/config.status:
+@echo "Configuring Postgres v14 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v14
(cd $(POSTGRES_INSTALL_DIR)/build/v14 && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure \
CFLAGS='$(PG_CFLAGS)' \
$(ROOT_PROJECT_DIR)/vendor/postgres-v14/configure CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v14 > configure.log)
@@ -82,8 +81,7 @@ $(POSTGRES_INSTALL_DIR)/build/v15/config.status:
+@echo "Configuring Postgres v15 build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build/v15
(cd $(POSTGRES_INSTALL_DIR)/build/v15 && \
env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure \
CFLAGS='$(PG_CFLAGS)' \
$(ROOT_PROJECT_DIR)/vendor/postgres-v15/configure CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR))/v15 > configure.log)
@@ -113,8 +111,6 @@ postgres-v14: postgres-v14-configure \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14 MAKELEVEL=0 install
+@echo "Compiling libpq v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v14"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v14/contrib/pg_buffercache install
+@echo "Compiling pageinspect v14"
@@ -127,8 +123,6 @@ postgres-v15: postgres-v15-configure \
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15 MAKELEVEL=0 install
+@echo "Compiling libpq v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/src/interfaces/libpq install
+@echo "Compiling pg_prewarm v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_prewarm install
+@echo "Compiling pg_buffercache v15"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/v15/contrib/pg_buffercache install
+@echo "Compiling pageinspect v15"

View File

@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf etcd openssl flex bison
brew install protobuf etcd openssl
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -125,23 +125,24 @@ Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (r
# Create repository in .neon with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/neon_local init
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2545906
Successfully initialized timeline de200bd42b49cc1814412c7e592dd6e9
Stopped pageserver 1 process with pid 2545906
Starting pageserver at '127.0.0.1:64000' in '.neon'
Pageserver started
Successfully initialized timeline 7dd0907914ac399ff3be45fb252bfdb7
Stopping pageserver gracefully...done!
# start pageserver and safekeeper
> ./target/debug/neon_local start
Starting etcd broker using "/usr/bin/etcd"
etcd started, pid: 2545996
Starting pageserver at '127.0.0.1:64000' in '.neon'.
pageserver started, pid: 2546005
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'.
safekeeper 1 started, pid: 2546041
Starting etcd broker using /usr/bin/etcd
Starting pageserver at '127.0.0.1:64000' in '.neon'
Pageserver started
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
Safekeeper started
# start postgres compute node
> ./target/debug/neon_local pg start main
Starting new postgres (v14) main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

188
cli-v2-story.md Normal file
View File

@@ -0,0 +1,188 @@
Create a new Zenith repository in the current directory:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli init
The files belonging to this database system will be owned by user "heikki".
This user must also own the server process.
The database cluster will be initialized with locale "en_GB.UTF-8".
The default database encoding has accordingly been set to "UTF8".
The default text search configuration will be set to "english".
Data page checksums are disabled.
creating directory tmp ... ok
creating subdirectories ... ok
selecting dynamic shared memory implementation ... posix
selecting default max_connections ... 100
selecting default shared_buffers ... 128MB
selecting default time zone ... Europe/Helsinki
creating configuration files ... ok
running bootstrap script ... ok
performing post-bootstrap initialization ... ok
syncing data to disk ... ok
initdb: warning: enabling "trust" authentication for local connections
You can change this by editing pg_hba.conf or using the option -A, or
--auth-local and --auth-host, the next time you run initdb.
new zenith repository was created in .zenith
Initially, there is only one branch:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
main
Start a local Postgres instance on the branch:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start main
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:27:43.919 EEST [984664] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv6 address "::1", port 5432
2021-04-13 09:27:43.920 EEST [984664] LOG: listening on IPv4 address "127.0.0.1", port 5432
2021-04-13 09:27:43.927 EEST [984664] LOG: listening on Unix socket "/tmp/.s.PGSQL.5432"
2021-04-13 09:27:43.939 EEST [984665] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:27:43.939 EEST [984665] LOG: creating missing WAL directory "pg_wal/archive_status"
2021-04-13 09:27:44.189 EEST [984665] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:27:44.195 EEST [984665] LOG: invalid record length at 0/15FFB80: wanted 24, got 0
2021-04-13 09:27:44.195 EEST [984665] LOG: redo is not required
2021-04-13 09:27:44.225 EEST [984664] LOG: database system is ready to accept connections
done
server started
Run some commands against it:
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "create table foo (t text);"
CREATE TABLE
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "insert into foo values ('inserted on the main branch');"
INSERT 0 1
~/git-sandbox/zenith (cli-v2)$ psql postgres -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)
Create a new branch called 'experimental'. We create it from the
current end of the 'main' branch, but you could specify a different
LSN as the start point instead.
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch experimental main
branching at end of WAL: 0/161F478
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli branch
experimental
main
Start another Postgres instance off the 'experimental' branch:
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:28:41.874 EEST [984766] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv6 address "::1", port 5433
2021-04-13 09:28:41.875 EEST [984766] LOG: listening on IPv4 address "127.0.0.1", port 5433
2021-04-13 09:28:41.883 EEST [984766] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
2021-04-13 09:28:41.896 EEST [984767] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:28:42.265 EEST [984767] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:28:42.269 EEST [984767] LOG: redo starts at 0/15FFB80
2021-04-13 09:28:42.272 EEST [984767] LOG: invalid record length at 0/161F4B0: wanted 24, got 0
2021-04-13 09:28:42.272 EEST [984767] LOG: redo done at 0/161F478 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
2021-04-13 09:28:42.321 EEST [984766] LOG: database system is ready to accept connections
done
server started
Insert a row on the 'experimental' branch:
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "insert into foo values ('inserted on experimental')"
INSERT 0 1
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
inserted on experimental
(2 rows)
See that the other Postgres instance is still running on 'main' branch on port 5432:
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5432 -c "select * from foo"
t
-----------------------------
inserted on the main branch
(1 row)
Everything is stored in the .zenith directory:
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/
total 12
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 datadirs
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 refs
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:28 timelines
The 'datadirs' directory contains the datadirs of the running instances:
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/
total 8
drwx------ 18 heikki heikki 4096 Apr 13 09:27 3c0c634c1674079b2c6d4edf7c91523e
drwx------ 18 heikki heikki 4096 Apr 13 09:28 697e3c103d4b1763cd6e82e4ff361d76
~/git-sandbox/zenith (cli-v2)$ ls -l .zenith/datadirs/3c0c634c1674079b2c6d4edf7c91523e/
total 124
drwxr-xr-x 5 heikki heikki 4096 Apr 13 09:27 base
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 global
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_commit_ts
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_dynshmem
-rw------- 1 heikki heikki 4760 Apr 13 09:27 pg_hba.conf
-rw------- 1 heikki heikki 1636 Apr 13 09:27 pg_ident.conf
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:32 pg_logical
drwxr-xr-x 4 heikki heikki 4096 Apr 13 09:27 pg_multixact
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_notify
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_replslot
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_serial
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_snapshots
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_stat
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:34 pg_stat_tmp
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_subtrans
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_tblspc
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_twophase
-rw------- 1 heikki heikki 3 Apr 13 09:27 PG_VERSION
lrwxrwxrwx 1 heikki heikki 52 Apr 13 09:27 pg_wal -> ../../timelines/3c0c634c1674079b2c6d4edf7c91523e/wal
drwxr-xr-x 2 heikki heikki 4096 Apr 13 09:27 pg_xact
-rw------- 1 heikki heikki 88 Apr 13 09:27 postgresql.auto.conf
-rw------- 1 heikki heikki 28688 Apr 13 09:27 postgresql.conf
-rw------- 1 heikki heikki 96 Apr 13 09:27 postmaster.opts
-rw------- 1 heikki heikki 149 Apr 13 09:27 postmaster.pid
Note how 'pg_wal' is just a symlink to the 'timelines' directory. The
datadir is ephemeral, you can delete it at any time, and it can be reconstructed
from the snapshots and WAL stored in the 'timelines' directory. So if you push/pull
the repository, the 'datadirs' are not included. (They are like git working trees)
~/git-sandbox/zenith (cli-v2)$ killall -9 postgres
~/git-sandbox/zenith (cli-v2)$ rm -rf .zenith/datadirs/*
~/git-sandbox/zenith (cli-v2)$ ./target/debug/cli start experimental -- -o -p5433
Creating data directory from snapshot at 0/15FFB08...
waiting for server to start....2021-04-13 09:37:05.476 EEST [985340] LOG: starting PostgreSQL 14devel on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv6 address "::1", port 5433
2021-04-13 09:37:05.477 EEST [985340] LOG: listening on IPv4 address "127.0.0.1", port 5433
2021-04-13 09:37:05.487 EEST [985340] LOG: listening on Unix socket "/tmp/.s.PGSQL.5433"
2021-04-13 09:37:05.498 EEST [985341] LOG: database system was interrupted; last known up at 2021-04-13 09:27:33 EEST
2021-04-13 09:37:05.808 EEST [985341] LOG: database system was not properly shut down; automatic recovery in progress
2021-04-13 09:37:05.813 EEST [985341] LOG: redo starts at 0/15FFB80
2021-04-13 09:37:05.815 EEST [985341] LOG: invalid record length at 0/161F770: wanted 24, got 0
2021-04-13 09:37:05.815 EEST [985341] LOG: redo done at 0/161F738 system usage: CPU: user: 0.00 s, system: 0.00 s, elapsed: 0.00 s
2021-04-13 09:37:05.866 EEST [985340] LOG: database system is ready to accept connections
done
server started
~/git-sandbox/zenith (cli-v2)$ psql postgres -p5433 -c "select * from foo"
t
-----------------------------
inserted on the main branch
inserted on experimental
(2 rows)

View File

@@ -65,7 +65,7 @@ impl GenericOption {
let name = match self.name.as_str() {
"safekeepers" => "neon.safekeepers",
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
"wal_acceptor_connection_timeout" => "neon.safekeeper_connection_timeout",
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
it => it,
};

View File

@@ -26,18 +26,8 @@ use nix::unistd::Pid;
use utils::lock_file;
// These constants control the loop used to poll for process start / stop.
//
// The loop waits for at most 10 seconds, polling every 100 ms.
// Once a second, it prints a dot ("."), to give the user an indication that
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const RETRY_UNTIL_SECS: u64 = 10;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
const NOTICE_AFTER_RETRIES: u64 = 50;
const RETRIES: u32 = 15;
const RETRY_TIMEOUT_MILLIS: u64 = 500;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
@@ -117,16 +107,16 @@ where
return Ok(spawned_process);
}
Ok(false) => {
if retries == NOTICE_AFTER_RETRIES {
// The process is taking a long time to start up. Keep waiting, but
// print a message
print!("\n{process_name} has not started yet, continuing to wait");
}
if retries % DOT_EVERY_RETRIES == 0 {
if retries < 5 {
print!(".");
io::stdout().flush().unwrap();
} else {
if retries == 5 {
println!() // put a line break after dots for second message
}
println!("{process_name} has not started yet, retrying ({retries})...");
}
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
thread::sleep(Duration::from_millis(RETRY_TIMEOUT_MILLIS));
}
Err(e) => {
println!("{process_name} failed to start: {e:#}");
@@ -137,8 +127,7 @@ where
}
}
}
println!();
anyhow::bail!("{process_name} did not start in {RETRY_UNTIL_SECS} seconds");
anyhow::bail!("{process_name} could not start in {RETRIES} attempts");
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -169,7 +158,7 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
// Wait until process is gone
for retries in 0..RETRIES {
for _ in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
@@ -181,16 +170,9 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
return Ok(());
}
Ok(false) => {
if retries == NOTICE_AFTER_RETRIES {
// The process is taking a long time to start up. Keep waiting, but
// print a message
print!("\n{process_name} has not stopped yet, continuing to wait");
}
if retries % DOT_EVERY_RETRIES == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
print!(".");
io::stdout().flush().unwrap();
thread::sleep(Duration::from_secs(1))
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -198,21 +180,24 @@ pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> any
}
}
}
println!();
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
anyhow::bail!("{process_name} with pid {pid} failed to stop in {RETRIES} attempts");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
let mut filled_cmd = cmd.env_clear().env("RUST_BACKTRACE", "1");
// Pass through these environment variables to the command
for var in ["LLVM_PROFILE_FILE", "FAILPOINTS", "RUST_LOG"] {
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
let var = "LLVM_PROFILE_FILE";
if let Some(val) = std::env::var_os(var) {
filled_cmd = filled_cmd.env(var, val);
}
filled_cmd
const RUST_LOG_KEY: &str = "RUST_LOG";
if let Ok(rust_log_value) = std::env::var(RUST_LOG_KEY) {
filled_cmd.env(RUST_LOG_KEY, rust_log_value)
} else {
filled_cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {

View File

@@ -343,7 +343,7 @@ impl PostgresNode {
// To be able to restore database in case of pageserver node crash, safekeeper should not
// remove WAL beyond this point. Too large lag can cause space exhaustion in safekeepers
// (if they are not able to upload WAL to S3).
conf.append("max_replication_write_lag", "15MB");
conf.append("max_replication_write_lag", "500MB");
conf.append("max_replication_flush_lag", "10GB");
if !self.env.safekeepers.is_empty() {

View File

@@ -6,7 +6,7 @@ use crate::{background_process, local_env};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
print!(
println!(
"Starting etcd broker using {:?}",
etcd_broker.etcd_binary_path
);

View File

@@ -237,7 +237,7 @@ impl PageServerNode {
datadir: &Path,
update_config: bool,
) -> anyhow::Result<Child> {
print!(
println!(
"Starting pageserver at '{}' in '{}'",
self.pg_connection_config.raw_address(),
datadir.display()
@@ -362,11 +362,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
};
if !settings.is_empty() {
bail!("Unrecognized tenant settings: {settings:?}")
@@ -429,11 +424,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.get("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
})
.send()?
.error_from_body()?;

View File

@@ -1,13 +0,0 @@
# Wrapper image around a compute-node image that adds a few extra CLI tools.
# REPOSITORY, COMPUTE_IMAGE and TAG select the base compute image.
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG COMPUTE_IMAGE=compute-node-v14
ARG TAG=latest

FROM $REPOSITORY/${COMPUTE_IMAGE}:$TAG

# Package installation needs root; drop back to postgres afterwards.
USER root
# Remove the apt package index in the same layer so it does not bloat the image.
RUN apt-get update && \
    apt-get install -y curl \
                       jq \
                       netcat && \
    rm -rf /var/lib/apt/lists/*
USER postgres

View File

@@ -2,7 +2,6 @@ version: '3'
services:
etcd:
restart: always
image: quay.io/coreos/etcd:v3.5.4
ports:
- 2379:2379
@@ -10,7 +9,7 @@ services:
environment:
# This significantly speeds up etcd, and we don't need data persistence there anyway.
ETCD_UNSAFE_NO_FSYNC: "1"
command:
command:
- "etcd"
- "--auto-compaction-mode=revision"
- "--auto-compaction-retention=1"
@@ -25,7 +24,6 @@ services:
- "--quota-backend-bytes=134217728" # 128 MB
minio:
restart: always
image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z
ports:
- 9000:9000
@@ -43,7 +41,7 @@ services:
entrypoint:
- "/bin/sh"
- "-c"
command:
command:
- "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do
echo 'Waiting to start minio...' && sleep 1;
done;
@@ -53,8 +51,7 @@ services:
- minio
pageserver:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
image: neondatabase/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://etcd:2379'
- AWS_ACCESS_KEY_ID=minio
@@ -80,8 +77,7 @@ services:
- minio_create_buckets
safekeeper1:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454
- SAFEKEEPER_ID=1
@@ -110,8 +106,7 @@ services:
- minio_create_buckets
safekeeper2:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454
- SAFEKEEPER_ID=2
@@ -140,8 +135,7 @@ services:
- minio_create_buckets
safekeeper3:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
image: neondatabase/neon:${TAG:-latest}
environment:
- SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454
- SAFEKEEPER_ID=3
@@ -170,21 +164,18 @@ services:
- minio_create_buckets
compute:
restart: always
build:
context: ./compute_wrapper/
context: ./image/compute
args:
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}
- TAG=${TAG:-latest}
- COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest}
- http_proxy=$http_proxy
- https_proxy=$https_proxy
environment:
- PG_VERSION=${PG_VERSION:-14}
#- RUST_BACKTRACE=1
# Mount the test files directly, for faster editing cycle.
volumes:
- ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute_wrapper/shell/:/shell/
- ./compute/var/db/postgres/specs/:/var/db/postgres/specs/
- ./compute/shell/:/shell/
ports:
- 55433:55433 # pg protocol handler
- 3080:3080 # http endpoints

View File

@@ -1,60 +0,0 @@
#!/bin/bash

# A basic test to ensure Docker images are built correctly.
# Builds a wrapper around the compute, starts all services and runs a simple SQL query.
# Repeats the process for all currently supported Postgres versions.

# Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file.
# Their defaults point at the DockerHub `neondatabase/neon:latest` image;
# override them to verify custom image builds (e.g. pre-published ones).

# XXX: Currently does not work on M1 Macs: only x86_64 Docker images are built, and there is
# no seccomp support in the M1 Docker emulation layer.

set -eux -o pipefail

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted everywhere it is used, so a checkout path containing spaces still works.
COMPOSE_FILE="$SCRIPT_DIR/docker-compose.yml"

COMPUTE_CONTAINER_NAME=docker-compose-compute-1
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
# The single quotes around $SQL are interpreted by the inner `bash -c` inside the container.
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"

# Print container status and logs, then tear the compose project down.
cleanup() {
    echo "show container information"
    docker ps
    docker compose -f "$COMPOSE_FILE" logs
    echo "stop containers..."
    docker compose -f "$COMPOSE_FILE" down
}

echo "clean up containers if exists"
cleanup

for pg_version in 14 15; do
    echo "start containers (pg_version=$pg_version)."
    PG_VERSION=$pg_version docker compose -f "$COMPOSE_FILE" up --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
    while sleep 1; do
        # check timeout
        cnt=$((cnt + 1))
        if [ "$cnt" -gt 60 ]; then
            echo "timeout before the compute is ready."
            cleanup
            exit 1
        fi

        # check if the compute is ready
        # grep exits non-zero when nothing matches yet, so pipefail is
        # disabled around this pipeline to keep `set -e` from aborting.
        set +o pipefail
        result=$(docker compose -f "$COMPOSE_FILE" logs "compute_is_ready" | grep "accepting connections" | wc -l)
        set -o pipefail

        if [ "$result" -eq 1 ]; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
            docker exec "$COMPUTE_CONTAINER_NAME" /bin/bash -c "psql $PSQL_OPTION"
            cleanup
            break
        fi
    done
done

View File

@@ -0,0 +1,10 @@
# Wrapper image around a compute-node image that adds a few extra CLI tools.
# COMPUTE_IMAGE is the "<image>:<tag>" of the base image under neondatabase/.
ARG COMPUTE_IMAGE=compute-node-v14:latest
FROM neondatabase/${COMPUTE_IMAGE}

# Package installation needs root; drop back to postgres afterwards.
USER root
# Remove the apt package index in the same layer so it does not bloat the image.
RUN apt-get update && \
    apt-get install -y curl \
                       jq \
                       netcat && \
    rm -rf /var/lib/apt/lists/*
USER postgres

View File

@@ -37,7 +37,7 @@
- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging](./error-handling.md)
- [Error handling and logging]()
- [Testing]()
- [Unit testing]()
- [Integration testing]()

View File

@@ -1,198 +0,0 @@
# Error handling and logging
## Logging errors
The principle is that errors are logged when they are handled. If you
just propagate an error to the caller in a function, you don't need to
log it; the caller will. But if you consume an error in a function,
you *must* log it (if it needs to be logged at all).
For example:
```rust
/// Reads the whole of "/etc/motd" into a `String`.
///
/// Errors from opening or reading the file are propagated to the caller
/// with `?`; nothing is logged here.
fn read_motd_file() -> std::io::Result<String> {
    let mut f = File::open("/etc/motd")?;
    let mut result = String::new();
    f.read_to_string(&mut result)?;
    // The return type is `io::Result<String>`, so the value must be wrapped in `Ok`.
    Ok(result)
}
```
Opening or reading the file could fail, but there is no need to log
the error here. The function merely propagates the error to the
caller, and it is up to the caller to log the error or propagate it
further, if the failure is not expected. But if, for example, it is
normal that the "/etc/motd" file doesn't exist, the caller can choose
to silently ignore the error, or log it as an INFO or DEBUG level
message:
```rust
fn get_message_of_the_day() -> String {
// Get the motd from /etc/motd, or return the default proverb
match read_motd_file() {
Ok(motd) => motd,
Err(err) => {
// It's normal that /etc/motd doesn't exist, but if we fail to
// read it for some other reason, that's unexpected. The message
// of the day isn't very important though, so we just WARN and
// continue with the default in any case.
if err.kind() != std::io::ErrorKind::NotFound {
tracing::warn!("could not read \"/etc/motd\": {err:?}");
}
"An old error is always more popular than a new truth. - German proverb"
}
}
}
```
## Error types
We use the `anyhow` crate widely. It contains many convenient macros
like `bail!` and `ensure!` to construct and return errors, and to
propagate many kinds of low-level errors, wrapped in `anyhow::Error`.
A downside of `anyhow::Error` is that the caller cannot distinguish
between different error cases. Most errors are propagated all the way
to the mgmt API handler function, or the main loop that handles a
connection with the compute node, and they are all handled the same
way: the error is logged and returned to the client as an HTTP or
libpq error.
But in some cases, we need to distinguish between errors and handle
them differently. For example, attaching a tenant to the pageserver
could fail either because the tenant has already been attached, or
because we could not load its metadata from cloud storage. The first
case is more or less expected. The console sends the Attach request to
the pageserver, and the pageserver completes the operation, but the
network connection might be lost before the console receives the
response. The console will retry the operation in that case, but the
tenant has already been attached. It is important that the pageserver
responds with the HTTP 403 Already Exists error in that case, rather
than a generic HTTP 500 Internal Server Error.
If you need to distinguish between different kinds of errors, create a
new `Error` type. The `thiserror` crate is useful for that. But in
most cases `anyhow::Error` is good enough.
## Panics
Depending on where a panic happens, it can cause the whole pageserver
or safekeeper to restart, or just a single tenant. In either case,
that is pretty bad and causes an outage. Avoid panics. Never use
`unwrap()` or other calls that might panic, to verify inputs from the
network or from disk.
It is acceptable to use functions that might panic, like `unwrap()`, if
it is obvious that it cannot panic. For example, if you have just
checked that a variable is not None, it is OK to call `unwrap()` on it,
but it is still preferable to use `expect("reason")` instead to explain
why the function cannot fail.
`assert!` and `panic!` are reserved for checking clear invariants and
very obvious "can't happen" cases. When in doubt, use anyhow `ensure!`
or `bail!` instead.
## Error levels
`tracing::Level` doesn't provide very clear guidelines on what the
different levels mean, or when to use which level. Here is how we use
them:
### Error
Examples:
- could not open file "foobar"
- invalid tenant id
Errors are not expected to happen during normal operation. Incorrect
inputs from client can cause ERRORs. For example, if a client tries to
call a mgmt API that doesn't exist, or if a compute node sends
an LSN that has already been garbage collected away.
These should *not* happen during normal operations. "Normal
operations" is not a very precise concept. But for example, disk
errors are not expected to happen when the system is working, so those
count as Errors. However, if a TCP connection to a compute node is
lost, that is not considered an Error, because it doesn't affect the
pageserver's or safekeeper's operation in any way, and happens fairly
frequently when compute nodes are shut down, or are killed abruptly
because of errors in the compute.
**Errors are monitored, and always need human investigation to determine
the cause.**
Whether something should be logged at ERROR, WARNING or INFO level can
depend on the callers and clients. For example, it might be unexpected
and a sign of a serious issue if the console calls the
"timeline_detail" mgmt API for a timeline that doesn't exist. ERROR
would be appropriate in that case. But if the console routinely calls
the API after deleting a timeline, to check if the deletion has
completed, then it would be totally normal and an INFO or DEBUG level
message would be more appropriate. If a message is logged as an ERROR,
but it in fact happens frequently in production and never requires any
action, it should probably be demoted to an INFO level message.
### Warn
Examples:
- could not remove temporary file "foobar.temp"
- unrecognized file "foobar" in timeline directory
Warnings are similar to Errors, in that they should not happen
when the system is operating normally. The difference between Error and
Warning is that an Error means that the operation failed, whereas Warning
means that something unexpected happened, but the operation continued anyway.
For example, if deleting a file fails because the file already didn't exist,
it should be logged as Warning.
> **Note:** The python regression tests, under `test_regress`, check the
> pageserver log after each test for any ERROR and WARN lines. If there are
> any ERRORs or WARNs that have not been explicitly listed in the test as
> allowed, the test is marked as failed. This is to catch unexpected errors
> e.g. in background operations, that don't cause immediate misbehaviour in
> the tested functionality.
### Info
Info level is used to log useful information when the system is
operating normally. Info level is appropriate e.g. for logging state
changes, background operations, and network connections.
Examples:
- "system is shutting down"
- "tenant was created"
- "retrying S3 upload"
### Debug & Trace
Debug and Trace level messages are not printed to the log in our normal
production configuration, but could be enabled for a specific server or
tenant, to aid debugging. (Although we don't actually have that
capability as of this writing).
## Context
We use logging "spans" to hold context information about the current
operation. Almost every operation happens on a particular tenant and
timeline, so we enter a span with the "tenant_id" and "timeline_id"
very early when processing an incoming API request, for example. All
background operations should also run in a span containing at least
those two fields, and any other parameters or information that might
be useful when debugging an error that might happen when performing
the operation.
TODO: Spans are not captured in the Error when it is created, but when
the error is logged. It would be more useful to capture them at Error
creation. We should consider using `tracing_error::SpanTrace` to do
that.
## Error message style
PostgreSQL has a style guide for writing error messages:
https://www.postgresql.org/docs/current/error-style-guide.html
Follow that guide when writing error messages in the PostgreSQL
extension. We don't follow it strictly in the pageserver and
safekeeper, but the advice in the PostgreSQL style guide is generally
good, and you can't go wrong by following it.

View File

@@ -83,16 +83,6 @@ A subject for future modularization.
`/libs/metrics`:
Helpers for exposing Prometheus metrics from the server.
### Adding dependencies
When you add a Cargo dependency, you should update the hakari manifest by running the commands below and committing the updated `Cargo.lock` and `workspace_hack/`. There may be no changes; that's fine.
```bash
cargo hakari generate
cargo hakari manage-deps
```
If you don't have hakari installed (`error: no such subcommand: hakari`), install it by running `cargo install cargo-hakari`.
## Using Python
Note that Debian/Ubuntu Python packages are stale, as commonly happens,
so manual installation of dependencies is not recommended.

View File

@@ -73,7 +73,6 @@ pub struct TenantCreateRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
#[serde_as]
@@ -113,7 +112,6 @@ pub struct TenantConfigRequest {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
impl TenantConfigRequest {
@@ -132,7 +130,6 @@ impl TenantConfigRequest {
walreceiver_connect_timeout: None,
lagging_wal_timeout: None,
max_lsn_wal_lag: None,
trace_read_requests: None,
}
}
}

View File

@@ -1,12 +0,0 @@
[package]
name = "persistent_range_query"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
rand = "0.8.3"

View File

@@ -1,78 +0,0 @@
use std::ops::Range;
pub mod naive;
pub mod ops;
pub mod segment_tree;
/// Result of a query over a contiguous range of keys.
///
/// Should be a monoid:
/// * Identity element: for all a: combine(new_for_empty_range(), a) = combine(a, new_for_empty_range()) = a
/// * Associativity: for all a, b, c: combine(combine(a, b), c) == combine(a, combine(b, c))
pub trait RangeQueryResult<Key>: Sized + Clone {
/// Returns the identity element, i.e. the result of querying an empty range.
// Clone is equivalent to combine with an empty range.
fn new_for_empty_range() -> Self;
/// Builds the result for the concatenation of two adjacent ranges.
// Contract: left_range.end == right_range.start
// left_range.start == left_range.end == right_range.start == right_range.end is still possible
fn combine(
left: &Self,
left_range: &Range<Key>,
right: &Self,
right_range: &Range<Key>,
) -> Self;
/// Variant of `combine` that accumulates into `left` instead of returning a new value.
// NOTE(review): presumably must leave `left` equal to `combine(left, .., right, ..)` — confirm.
fn add(left: &mut Self, left_range: &Range<Key>, right: &Self, right_range: &Range<Key>);
}
/// Source of initial query results, evaluated on demand for whichever range is asked for.
pub trait LazyRangeInitializer<Result: RangeQueryResult<Key>, Key> {
/// Returns the query result that `range` has before any modification is applied.
fn get(&self, range: &Range<Key>) -> Result;
}
/// A modification that can be applied to every key of a range at once.
///
/// Should be a monoid:
/// * Identity element: for all op: compose(no_op(), op) == compose(op, no_op()) == op
/// * Associativity: for all op_1, op_2, op_3: compose(compose(op_1, op_2), op_3) == compose(op_1, compose(op_2, op_3))
///
/// Should left act on Result:
/// * Identity operation: for all r: no_op().apply(r) == r
/// * Compatibility: for all op_1, op_2, r: op_1.apply(op_2.apply(r)) == compose(op_1, op_2).apply(r)
pub trait RangeModification<Key> {
/// The query result type this modification acts on.
type Result: RangeQueryResult<Key>;
/// Returns the identity modification, which leaves every result unchanged.
fn no_op() -> Self;
/// Returns `true` if `self` is the identity modification.
fn is_no_op(&self) -> bool;
/// Returns `true` if `self` re-initializes the range it is applied to.
// NOTE(review): presumably "the outcome does not depend on the previous contents" — confirm.
fn is_reinitialization(&self) -> bool;
/// Updates `result`, the query result of `range`, as if `self` was applied to all of `range`.
fn apply(&self, result: &mut Self::Result, range: &Range<Key>);
/// Folds `later` into `earlier`, so that `earlier` then stands for
/// "apply the old `earlier`, then `later`".
fn compose(later: &Self, earlier: &mut Self);
}
/// Read-only view of one version of the storage.
pub trait VecReadableVersion<Modification: RangeModification<Key>, Key> {
/// Returns the query result for the key range `keys`.
fn get(&self, keys: &Range<Key>) -> Modification::Result;
}
/// A readable version that can also be cloned; shorthand for
/// `Clone + VecReadableVersion<Modification, Key>`.
// TODO: use trait alias when stabilized
pub trait VecFrozenVersion<Modification: RangeModification<Key>, Key>:
Clone + VecReadableVersion<Modification, Key>
{
}
// Blanket implementation: every `Clone + VecReadableVersion` type is a `VecFrozenVersion`.
impl<
T: Clone + VecReadableVersion<Modification, Key>,
Modification: RangeModification<Key>,
Key,
> VecFrozenVersion<Modification, Key> for T
{
}
/// Mutable storage over a key range that supports range modifications, range
/// queries, and taking frozen snapshots of its current state.
pub trait PersistentVecStorage<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key,
>: VecReadableVersion<Modification, Key>
{
/// Creates a storage covering `all_keys` whose initial contents come from `initializer`.
fn new(all_keys: Range<Key>, initializer: Initializer) -> Self;
/// Type of the snapshots returned by `freeze`.
type FrozenVersion: VecFrozenVersion<Modification, Key>;
/// Applies `modification` to every key in `keys`.
fn modify(&mut self, keys: &Range<Key>, modification: &Modification);
/// Returns a snapshot of the current state.
// NOTE(review): presumably later `modify` calls must not affect the snapshot — the tests rely on it.
fn freeze(&mut self) -> Self::FrozenVersion;
}

View File

@@ -1,115 +0,0 @@
use crate::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::marker::PhantomData;
use std::ops::Range;
use std::rc::Rc;
/// Snapshot produced by `NaiveVecStorage::freeze`: a full copy of the per-element results.
pub struct NaiveFrozenVersion<Modification: RangeModification<Key>, Key> {
// Key range covered by the snapshot.
all_keys: Range<Key>,
// One result per element of `all_keys`; behind `Rc` so cloning the snapshot shares the data.
values: Rc<Box<Vec<Modification::Result>>>,
}
/// A key that can be converted to and from a position in a flat vector of elements.
pub trait IndexableKey: Clone {
/// Returns the vector index that corresponds to `key` within `all_keys`.
// NOTE(review): `NaiveVecStorage::new` uses `index(all_keys, all_keys.end)` as the element
// count, so the index is presumably zero-based from `all_keys.start` — confirm.
fn index(all_keys: &Range<Self>, key: &Self) -> usize;
/// Returns the key range occupied by the single element stored at `index`.
fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self>;
}
/// Computes the query result for `keys` by accumulating the per-element results one by one.
///
/// * `all_keys` - the full key range that `values` covers.
/// * `values` - one result per element of `all_keys`.
/// * `keys` - the queried range; it is translated to element indices with
///   `IndexableKey::index`, and an empty index range yields
///   `new_for_empty_range()`.
///
/// Panics if an index computed from `keys` is out of bounds for `values`.
fn get<Modification: RangeModification<Key>, Key: IndexableKey>(
    all_keys: &Range<Key>,
    values: &[Modification::Result],
    keys: &Range<Key>,
) -> Modification::Result {
    let mut result = Modification::Result::new_for_empty_range();
    // Range covered by `result` so far; starts empty and grows one element per iteration.
    let mut result_range = keys.start.clone()..keys.start.clone();
    let first = IndexableKey::index(all_keys, &keys.start);
    let last = IndexableKey::index(all_keys, &keys.end);
    for index in first..last {
        let element_range = IndexableKey::element_range(all_keys, index);
        Modification::Result::add(&mut result, &result_range, &values[index], &element_range);
        result_range.end = element_range.end;
    }
    result
}
// Queries on a snapshot read the snapshot's own copy of the element results.
impl<Modification: RangeModification<Key>, Key: IndexableKey> VecReadableVersion<Modification, Key>
for NaiveFrozenVersion<Modification, Key>
{
/// Returns the query result for `keys` as of the moment the snapshot was taken.
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get::<Modification, Key>(&self.all_keys, &self.values, keys)
}
}
// Manual implementation of `Clone` because `derive` requires `Modification: Clone`
impl<Modification: RangeModification<Key>, Key: Clone> Clone
for NaiveFrozenVersion<Modification, Key>
{
fn clone(&self) -> Self {
Self {
all_keys: self.all_keys.clone(),
// Clones only the `Rc` handle: the element vector is shared, not copied.
values: self.values.clone(),
}
}
}
/// Straightforward `PersistentVecStorage` that keeps one result per element in a `Vec`
/// and copies the whole vector on `freeze`.
// TODO: is it at all possible to store previous versions in this struct,
// without any Rc<>?
pub struct NaiveVecStorage<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> {
// Key range covered by the storage.
all_keys: Range<Key>,
// Current result of every element, indexed via `IndexableKey::index`.
last_version: Vec<Modification::Result>,
// The initializer is only used inside `new`; this marker just keeps the type parameter used.
_initializer: PhantomData<Initializer>,
}
// Queries on the live storage read the current (latest) element results.
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: IndexableKey,
> VecReadableVersion<Modification, Key> for NaiveVecStorage<Modification, Initializer, Key>
{
/// Returns the query result for `keys` in the current state of the storage.
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get::<Modification, Key>(&self.all_keys, &self.last_version, keys)
}
}
impl<
        Modification: RangeModification<Key>,
        Initializer: LazyRangeInitializer<Modification::Result, Key>,
        Key: IndexableKey,
    > PersistentVecStorage<Modification, Initializer, Key>
    for NaiveVecStorage<Modification, Initializer, Key>
{
    /// Creates the storage by eagerly asking `initializer` for the result of
    /// every single element of `all_keys`.
    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
        // Number of elements. This must NOT be read back from `Vec::capacity()`:
        // `Vec::with_capacity` only guarantees *at least* the requested capacity.
        let len = IndexableKey::index(&all_keys, &all_keys.end);
        let mut values: Vec<Modification::Result> = Vec::with_capacity(len);
        for index in 0..len {
            values.push(initializer.get(&IndexableKey::element_range(&all_keys, index)));
        }
        NaiveVecStorage {
            all_keys,
            last_version: values,
            _initializer: PhantomData,
        }
    }

    type FrozenVersion = NaiveFrozenVersion<Modification, Key>;

    /// Applies `modification` to each element of `keys` individually.
    fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
        let first = IndexableKey::index(&self.all_keys, &keys.start);
        let last = IndexableKey::index(&self.all_keys, &keys.end);
        for index in first..last {
            let element_range = IndexableKey::element_range(&self.all_keys, index);
            modification.apply(&mut self.last_version[index], &element_range);
        }
    }

    /// Returns a snapshot holding a full copy of the current element results,
    /// so later modifications of `self` do not affect it.
    fn freeze(&mut self) -> Self::FrozenVersion {
        NaiveFrozenVersion::<Modification, Key> {
            all_keys: self.all_keys.clone(),
            values: Rc::new(Box::new(self.last_version.clone())),
        }
    }
}

View File

@@ -1,14 +0,0 @@
pub mod rsq;
/// Initializer parameterized by a single value.
// NOTE(review): judging by the name, every element starts out equal to that value — the
// `LazyRangeInitializer` implementations decide how it is actually used.
#[derive(Copy, Clone, Debug)]
pub struct SameElementsInitializer<T> {
// The value handed to `new`.
initial_element_value: T,
}
impl<T> SameElementsInitializer<T> {
/// Wraps `initial_element_value` into an initializer.
pub fn new(initial_element_value: T) -> Self {
SameElementsInitializer {
initial_element_value,
}
}
}

View File

@@ -1,118 +0,0 @@
//! # Range Sum Query
use crate::ops::SameElementsInitializer;
use crate::{LazyRangeInitializer, RangeModification, RangeQueryResult};
use std::borrow::Borrow;
use std::ops::{Add, AddAssign, Range};
// TODO: commutative Add
/// Query result of a range-sum query: the sum of the elements in the range.
#[derive(Clone, Copy, Debug)]
pub struct SumResult<T> {
sum: T,
}
impl<T> SumResult<T> {
/// Returns a reference to the stored sum.
pub fn sum(&self) -> &T {
&self.sum
}
}
// Sums form a monoid under addition; the key ranges are not needed to combine them.
impl<T: Clone + for<'a> AddAssign<&'a T> + From<u8>, Key> RangeQueryResult<Key> for SumResult<T>
where
for<'a> &'a T: Add<&'a T, Output = T>,
{
/// The sum over an empty range: zero, obtained as `T::from(0u8)`.
fn new_for_empty_range() -> Self {
SumResult { sum: 0.into() }
}
/// Returns a new result holding `left.sum + right.sum`.
fn combine(
left: &Self,
_left_range: &Range<Key>,
right: &Self,
_right_range: &Range<Key>,
) -> Self {
SumResult {
sum: &left.sum + &right.sum,
}
}
/// Adds `right.sum` to `left.sum` in place.
fn add(left: &mut Self, _left_range: &Range<Key>, right: &Self, _right_range: &Range<Key>) {
left.sum += &right.sum
}
}
/// Value type for which the sum over a range of identical elements can be computed directly.
pub trait SumOfSameElements<Key> {
/// Returns the sum over `keys` when every element equals `initial_element_value`.
fn sum(initial_element_value: &Self, keys: &Range<Key>) -> Self;
}
// `TB: Borrow<T>` lets the initializer hold either an owned `T` or a reference to one.
impl<T: SumOfSameElements<Key>, TB: Borrow<T>, Key> LazyRangeInitializer<SumResult<T>, Key>
for SameElementsInitializer<TB>
where
SumResult<T>: RangeQueryResult<Key>,
{
/// Returns the sum over `range`, delegating to `SumOfSameElements::sum` with the stored value.
fn get(&self, range: &Range<Key>) -> SumResult<T> {
SumResult {
sum: SumOfSameElements::sum(self.initial_element_value.borrow(), range),
}
}
}
/// Range modification for range-sum queries.
#[derive(Copy, Clone, Debug)]
pub enum AddAssignModification<T> {
// Leaves the range unchanged (the identity modification).
None,
// Adds the value to every element of the range.
Add(T),
// Overwrites every element of the range with the value.
Assign(T),
}
impl<T: Clone + for<'a> AddAssign<&'a T>, Key> RangeModification<Key> for AddAssignModification<T>
where
    SumResult<T>: RangeQueryResult<Key>,
    for<'a> SameElementsInitializer<&'a T>: LazyRangeInitializer<SumResult<T>, Key>,
{
    type Result = SumResult<T>;

    /// The identity modification is `None`.
    fn no_op() -> Self {
        AddAssignModification::None
    }

    /// Returns `true` only for `None`.
    fn is_no_op(&self) -> bool {
        matches!(self, AddAssignModification::None)
    }

    /// Returns `true` only for `Assign`, which overwrites whatever the range held before.
    fn is_reinitialization(&self) -> bool {
        matches!(self, AddAssignModification::Assign(_))
    }

    /// Updates the sum of `range` as if the modification was applied to each of its elements.
    ///
    /// The contribution of the whole range is computed as the sum of a range
    /// in which every element equals the operand.
    fn apply(&self, result: &mut SumResult<T>, range: &Range<Key>) {
        use AddAssignModification::*;
        match self {
            None => {}
            Add(x) => {
                let to_add = SameElementsInitializer::new(x).get(range).sum;
                result.sum += &to_add;
            }
            Assign(x) => {
                result.sum = SameElementsInitializer::new(x).get(range).sum;
            }
        }
    }

    /// Folds `later` into `earlier` so that `earlier` means "old `earlier`, then `later`".
    fn compose(later: &Self, earlier: &mut Self) {
        use AddAssignModification::*;
        match (later, earlier) {
            // Nothing was pending: the later modification is the whole story.
            (_, e @ None) => *e = later.clone(),
            (None, _) => {}
            // A later assignment discards anything that came before it.
            (Assign(_), e) => *e = later.clone(),
            (Add(x), Add(y)) => *y += x,
            // Assigning `value` and then adding `x` equals assigning `value + x`.
            (Add(x), Assign(value)) => *value += x,
        }
    }
}

View File

@@ -1,255 +0,0 @@
//! # Segment Tree
//! It is a competitive programming folklore data structure. Do not confuse with the interval tree.
use crate::{LazyRangeInitializer, PersistentVecStorage, RangeQueryResult, VecReadableVersion};
use std::ops::Range;
use std::rc::Rc;
/// A key type whose ranges can be split in two, as the segment tree requires.
pub trait MidpointableKey: Clone + Ord + Sized {
/// Returns the key at which `range` is split into a left and a right half.
// NOTE(review): for the recursion to terminate, a range of more than one element presumably
// needs `range.start < midpoint < range.end` — confirm with implementors.
fn midpoint(range: &Range<Self>) -> Self;
}
/// Segment-tree flavour of `crate::RangeModification` that additionally requires `Clone`.
pub trait RangeModification<Key>: Clone + crate::RangeModification<Key> {}
// TODO: use trait alias when stabilized
// Blanket implementation: every `Clone + crate::RangeModification` type qualifies.
impl<T: Clone + crate::RangeModification<Key>, Key> RangeModification<Key> for T {}
/// One node of the segment tree. The key range a node covers is not stored in it;
/// it is passed alongside the node by the functions that walk the tree.
#[derive(Debug)]
struct Node<Modification: RangeModification<Key>, Key> {
// Query result for the node's whole range, with all modifications already applied.
result: Modification::Result,
// Modification that has been applied to `result` but not yet pushed down to the children.
modify_children: Modification,
// Children are created lazily; `None` means "not materialized yet".
left: Option<Rc<Self>>,
right: Option<Rc<Self>>,
}
// Manual implementation because we don't need `Key: Clone` for this, unlike with `derive`.
impl<Modification: RangeModification<Key>, Key> Clone for Node<Modification, Key> {
// Shallow copy: the children are `Rc`s, so the subtrees stay shared with the original node.
fn clone(&self) -> Self {
Node {
result: self.result.clone(),
modify_children: self.modify_children.clone(),
left: self.left.clone(),
right: self.right.clone(),
}
}
}
impl<Modification: RangeModification<Key>, Key> Node<Modification, Key> {
    /// Creates a childless node for `range` whose result comes from `initializer`.
    fn new<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
        range: &Range<Key>,
        initializer: &Initializer,
    ) -> Self {
        Node {
            result: initializer.get(range),
            modify_children: Modification::no_op(),
            left: None,
            right: None,
        }
    }

    /// Applies `modification` to the whole node, whose key range is `range`.
    ///
    /// The node's own result is updated immediately; for the children the
    /// modification is only recorded in `modify_children` (lazy propagation).
    pub fn apply(&mut self, modification: &Modification, range: &Range<Key>) {
        modification.apply(&mut self.result, range);
        Modification::compose(modification, &mut self.modify_children);
        if self.modify_children.is_reinitialization() {
            // The pending modification overwrites the children anyway, so drop
            // them; `force_children` recreates them from the initializer.
            self.left = None;
            self.right = None;
        }
    }

    /// Materializes both children (creating them from `initializer` if absent)
    /// and pushes the pending modification down to them.
    ///
    /// Afterwards `modify_children` is a no-op. Children shared with other
    /// versions are copied first (`Rc::make_mut`), so those versions are not affected.
    pub fn force_children<Initializer: LazyRangeInitializer<Modification::Result, Key>>(
        &mut self,
        initializer: &Initializer,
        range_left: &Range<Key>,
        range_right: &Range<Key>,
    ) {
        let left = Rc::make_mut(
            self.left
                .get_or_insert_with(|| Rc::new(Node::new(range_left, initializer))),
        );
        let right = Rc::make_mut(
            self.right
                .get_or_insert_with(|| Rc::new(Node::new(range_right, initializer))),
        );
        left.apply(&self.modify_children, range_left);
        right.apply(&self.modify_children, range_right);
        self.modify_children = Modification::no_op();
    }

    /// Recomputes this node's result from the results of its two children.
    ///
    /// Panics unless the children exist and there is no pending modification,
    /// i.e. it must be called after `force_children`.
    pub fn recalculate_from_children(&mut self, range_left: &Range<Key>, range_right: &Range<Key>) {
        assert!(self.modify_children.is_no_op());
        assert!(self.left.is_some());
        assert!(self.right.is_some());
        self.result = Modification::Result::combine(
            &self.left.as_ref().unwrap().result,
            range_left,
            &self.right.as_ref().unwrap().result,
            range_right,
        );
    }
}
/// Splits `range` at its midpoint into two adjacent halves `(start..mid, mid..end)`.
fn split_range<Key: MidpointableKey>(range: &Range<Key>) -> (Range<Key>, Range<Key>) {
    let lower = range.start.clone();
    let mid = MidpointableKey::midpoint(range);
    let upper = range.end.clone();
    (lower..mid.clone(), mid..upper)
}
/// One version of the persistent segment tree: a root node plus what is needed to walk it.
pub struct PersistentSegmentTreeVersion<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: Clone,
> {
// Root of this version's tree; subtrees may be shared with other versions through `Rc`.
root: Rc<Node<Modification, Key>>,
// Key range covered by the root node.
all_keys: Range<Key>,
// Used to create nodes lazily; shared between versions.
initializer: Rc<Initializer>,
}
// Manual implementation because we don't need `Initializer: Clone` for this, unlike with `derive`.
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: Clone,
> Clone for PersistentSegmentTreeVersion<Modification, Initializer, Key>
{
// Clones the `Rc` handles and the key range; no tree nodes are copied.
fn clone(&self) -> Self {
Self {
root: self.root.clone(),
all_keys: self.all_keys.clone(),
initializer: self.initializer.clone(),
}
}
}
/// Returns the query result for the part of `keys` that lies inside `node_keys`,
/// the key range covered by `node`.
///
/// Takes the node mutably because answering a query that only partially overlaps
/// the node has to materialize its children and push pending modifications down.
fn get<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(
node: &mut Rc<Node<Modification, Key>>,
node_keys: &Range<Key>,
initializer: &Initializer,
keys: &Range<Key>,
) -> Modification::Result {
// No overlap between the node and the query: contributes nothing.
if node_keys.end <= keys.start || keys.end <= node_keys.start {
return Modification::Result::new_for_empty_range();
}
// The node lies entirely inside the query: its stored result is the answer.
if keys.start <= node_keys.start && node_keys.end <= keys.end {
return node.result.clone();
}
// Partial overlap: descend. `make_mut` copies the node first if it is shared.
let node = Rc::make_mut(node);
let (left_keys, right_keys) = split_range(node_keys);
node.force_children(initializer, &left_keys, &right_keys);
let mut result = get(node.left.as_mut().unwrap(), &left_keys, initializer, keys);
// NOTE(review): the ranges passed to `add` are the children's full ranges, not the parts
// of them that overlap `keys` — confirm this is what `RangeQueryResult::add` expects.
Modification::Result::add(
&mut result,
&left_keys,
&get(node.right.as_mut().unwrap(), &right_keys, initializer, keys),
&right_keys,
);
result
}
/// Applies `modification` to the part of `keys` that lies inside `node_keys`,
/// the key range covered by `node`.
///
/// Nodes fully covered by `keys` are updated lazily via `Node::apply`; partially
/// covered nodes are descended into and then recomputed from their children.
/// Shared nodes on the way are copied (`Rc::make_mut`) so that other versions
/// of the tree keep seeing the old state.
fn modify<
    Modification: RangeModification<Key>,
    Initializer: LazyRangeInitializer<Modification::Result, Key>,
    Key: MidpointableKey,
>(
    node: &mut Rc<Node<Modification, Key>>,
    node_keys: &Range<Key>,
    initializer: &Initializer,
    keys: &Range<Key>,
    modification: &Modification,
) {
    // Nothing to do for a no-op or when the node does not overlap `keys`.
    // Checked before `make_mut` so that untouched nodes are never copied.
    if modification.is_no_op() || node_keys.end <= keys.start || keys.end <= node_keys.start {
        return;
    }
    let node = Rc::make_mut(node);
    if keys.start <= node_keys.start && node_keys.end <= keys.end {
        node.apply(modification, node_keys);
        return;
    }
    let (left_keys, right_keys) = split_range(node_keys);
    node.force_children(initializer, &left_keys, &right_keys);
    modify(
        node.left.as_mut().unwrap(),
        &left_keys,
        initializer,
        keys,
        modification,
    );
    modify(
        node.right.as_mut().unwrap(),
        &right_keys,
        initializer,
        keys,
        modification,
    );
    node.recalculate_from_children(&left_keys, &right_keys);
}
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> VecReadableVersion<Modification, Key>
for PersistentSegmentTreeVersion<Modification, Initializer, Key>
{
/// Returns the query result for `keys` in this version.
///
/// The recursive `get` needs `&mut` access to materialize children, so it is run on a
/// temporary clone of the root `Rc`; `self` itself is left untouched.
fn get(&self, keys: &Range<Key>) -> Modification::Result {
get(
&mut self.root.clone(), // TODO: do not always force a branch
&self.all_keys,
self.initializer.as_ref(),
keys,
)
}
}
/// Mutable persistent segment tree; a thin wrapper around its current version.
pub struct PersistentSegmentTree<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
>(PersistentSegmentTreeVersion<Modification, Initializer, Key>);
impl<
Modification: RangeModification<Key>,
Initializer: LazyRangeInitializer<Modification::Result, Key>,
Key: MidpointableKey,
> VecReadableVersion<Modification, Key>
for PersistentSegmentTree<Modification, Initializer, Key>
{
/// Returns the query result for `keys` in the current version (delegates to it).
fn get(&self, keys: &Range<Key>) -> Modification::Result {
self.0.get(keys)
}
}
impl<
        Modification: RangeModification<Key>,
        Initializer: LazyRangeInitializer<Modification::Result, Key>,
        Key: MidpointableKey,
    > PersistentVecStorage<Modification, Initializer, Key>
    for PersistentSegmentTree<Modification, Initializer, Key>
{
    /// Creates a tree consisting of a single root node that covers `all_keys`;
    /// deeper nodes are created on demand.
    fn new(all_keys: Range<Key>, initializer: Initializer) -> Self {
        let root = Rc::new(Node::new(&all_keys, &initializer));
        let initializer = Rc::new(initializer);
        PersistentSegmentTree(PersistentSegmentTreeVersion {
            root,
            all_keys,
            initializer,
        })
    }

    type FrozenVersion = PersistentSegmentTreeVersion<Modification, Initializer, Key>;

    /// Applies `modification` to `keys` in the current version.
    fn modify(&mut self, keys: &Range<Key>, modification: &Modification) {
        let current = &mut self.0;
        modify(
            &mut current.root, // TODO: do not always force a branch
            &current.all_keys,
            current.initializer.as_ref(),
            keys,
            modification,
        )
    }

    /// Returns the current version as a snapshot; only `Rc` handles are cloned.
    fn freeze(&mut self) -> Self::FrozenVersion {
        let snapshot = &self.0;
        snapshot.clone()
    }
}

View File

@@ -1,295 +0,0 @@
use persistent_range_query::naive::{IndexableKey, NaiveVecStorage};
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{
LazyRangeInitializer, PersistentVecStorage, RangeModification, RangeQueryResult,
VecReadableVersion,
};
use std::cmp::Ordering;
use std::ops::Range;
// Key type used by this test: a page number.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct PageIndex(u32);
// Layers are identified by their name in this test.
type LayerId = String;
// Pages map to vector slots by their offset from the first page of the key range.
impl IndexableKey for PageIndex {
    fn index(all_keys: &Range<Self>, key: &Self) -> usize {
        let base = all_keys.start.0 as usize;
        let page = key.0 as usize;
        page - base
    }

    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
        let first = all_keys.start.0 + index as u32;
        PageIndex(first)..PageIndex(first + 1)
    }
}
// The midpoint is the start plus half of the range length, rounded down.
impl MidpointableKey for PageIndex {
    fn midpoint(range: &Range<Self>) -> Self {
        let first = range.start.0;
        let half_len = (range.end.0 - first) / 2;
        PageIndex(first + half_len)
    }
}
// Query result of the layer-map test.
#[derive(Clone, Debug, Eq, PartialEq)]
struct LayerMapInformation {
// Only make sense for a range of length 1.
last_layer: Option<LayerId>,
last_image_layer: Option<LayerId>,
// Work for all ranges
// Pair of (largest delta-layer count in the range, a sub-range where that count occurs).
max_delta_layers: (usize, Range<PageIndex>),
}
impl LayerMapInformation {
// Returns `(last_layer, last_image_layer)`.
fn last_layers(&self) -> (&Option<LayerId>, &Option<LayerId>) {
(&self.last_layer, &self.last_image_layer)
}
// Returns the `max_delta_layers` pair.
fn max_delta_layers(&self) -> &(usize, Range<PageIndex>) {
&self.max_delta_layers
}
}
// Joins two ranges: an empty `left` yields `right`; two non-empty ranges that touch
// (`left.end == right.start`) are glued together; in every other case `left` wins.
fn merge_ranges(left: &Range<PageIndex>, right: &Range<PageIndex>) -> Range<PageIndex> {
    if left.is_empty() {
        return right.clone();
    }
    let touching = !right.is_empty() && left.end == right.start;
    if touching {
        return left.start..right.end;
    }
    left.clone()
}
impl RangeQueryResult<PageIndex> for LayerMapInformation {
    // No layers at all, and a zero delta count over an empty range.
    fn new_for_empty_range() -> Self {
        let nowhere = PageIndex(0)..PageIndex(0);
        LayerMapInformation {
            last_layer: None,
            last_image_layer: None,
            max_delta_layers: (0, nowhere),
        }
    }

    fn combine(
        left: &Self,
        _left_range: &Range<PageIndex>,
        right: &Self,
        _right_range: &Range<PageIndex>,
    ) -> Self {
        // Note that either range may be empty.
        // For the layer names, the left value wins if present.
        let last_layer = left
            .last_layer
            .clone()
            .or_else(|| right.last_layer.clone());
        let last_image_layer = left
            .last_image_layer
            .clone()
            .or_else(|| right.last_image_layer.clone());

        // Keep the larger delta count; on a tie try to join the two sub-ranges.
        let (left_count, left_span) = &left.max_delta_layers;
        let (right_count, right_span) = &right.max_delta_layers;
        let max_delta_layers = match left_count.cmp(right_count) {
            Ordering::Less => (*right_count, right_span.clone()),
            Ordering::Greater => (*left_count, left_span.clone()),
            Ordering::Equal => (*left_count, merge_ranges(left_span, right_span)),
        };

        LayerMapInformation {
            last_layer,
            last_image_layer,
            max_delta_layers,
        }
    }

    // In-place version of `combine`.
    fn add(
        left: &mut Self,
        left_range: &Range<PageIndex>,
        right: &Self,
        right_range: &Range<PageIndex>,
    ) {
        let combined = Self::combine(&*left, left_range, right, right_range);
        *left = combined;
    }
}
// A batch of delta layers added on top of each other.
#[derive(Clone, Debug)]
struct AddDeltaLayers {
// The most recently added delta layer of the batch.
last_layer: LayerId,
// How many delta layers the batch contains.
count: usize,
}
// Range modification of the layer-map test: optionally a new image layer,
// then optionally a batch of delta layers on top of it (see `apply` for the order).
#[derive(Clone, Debug)]
struct LayerMapModification {
add_image_layer: Option<LayerId>,
add_delta_layers: Option<AddDeltaLayers>,
}
impl LayerMapModification {
    // Modification that adds the image layer `layer` and no delta layers.
    fn add_image_layer(layer: impl Into<LayerId>) -> Self {
        let image: LayerId = layer.into();
        Self {
            add_image_layer: Some(image),
            add_delta_layers: None,
        }
    }

    // Modification that adds exactly one delta layer, `layer`.
    fn add_delta_layer(layer: impl Into<LayerId>) -> Self {
        let single_delta = AddDeltaLayers {
            last_layer: layer.into(),
            count: 1,
        };
        Self {
            add_image_layer: None,
            add_delta_layers: Some(single_delta),
        }
    }
}
impl RangeModification<PageIndex> for LayerMapModification {
type Result = LayerMapInformation;
// The identity modification adds nothing.
fn no_op() -> Self {
LayerMapModification {
add_image_layer: None,
add_delta_layers: None,
}
}
fn is_no_op(&self) -> bool {
self.add_image_layer.is_none() && self.add_delta_layers.is_none()
}
// Adding an image layer overwrites every field of the result (see `apply`).
fn is_reinitialization(&self) -> bool {
self.add_image_layer.is_some()
}
fn apply(&self, result: &mut Self::Result, range: &Range<PageIndex>) {
// The image layer is applied first: it resets the delta count to 0 for the whole range.
if let Some(layer) = &self.add_image_layer {
result.last_layer = Some(layer.clone());
result.last_image_layer = Some(layer.clone());
result.max_delta_layers = (0, range.clone());
}
// Delta layers go on top: they become the last layer and raise the count.
if let Some(AddDeltaLayers { last_layer, count }) = &self.add_delta_layers {
result.last_layer = Some(last_layer.clone());
result.max_delta_layers.0 += count;
}
}
fn compose(later: &Self, earlier: &mut Self) {
// A later image layer makes everything that was pending before irrelevant.
if later.add_image_layer.is_some() {
*earlier = later.clone();
return;
}
// Otherwise append the later delta layers to whatever is already pending.
if let Some(AddDeltaLayers { last_layer, count }) = &later.add_delta_layers {
let res = earlier.add_delta_layers.get_or_insert(AddDeltaLayers {
last_layer: LayerId::default(),
count: 0,
});
res.last_layer = last_layer.clone();
res.count += count;
}
}
}
// Initial state of any range: no layers, and a delta count of 0 over the whole range.
impl LazyRangeInitializer<LayerMapInformation, PageIndex> for SameElementsInitializer<()> {
fn get(&self, range: &Range<PageIndex>) -> LayerMapInformation {
LayerMapInformation {
last_layer: None,
last_image_layer: None,
max_delta_layers: (0, range.clone()),
}
}
}
// Scenario shared by all storage implementations: build a small layer map over pages
// 0..100, take a snapshot before the last delta layer, and check both versions.
fn test_layer_map<
S: PersistentVecStorage<LayerMapModification, SameElementsInitializer<()>, PageIndex>,
>() {
let mut s = S::new(
PageIndex(0)..PageIndex(100),
SameElementsInitializer::new(()),
);
s.modify(
&(PageIndex(0)..PageIndex(70)),
&LayerMapModification::add_image_layer("Img0..70"),
);
// Overlaps the first image on pages 50..70 and replaces it there.
s.modify(
&(PageIndex(50)..PageIndex(100)),
&LayerMapModification::add_image_layer("Img50..100"),
);
s.modify(
&(PageIndex(10)..PageIndex(60)),
&LayerMapModification::add_delta_layer("Delta10..60"),
);
// Snapshot without "Delta20..80"; must stay unaffected by the modification below.
let s_before_last_delta = s.freeze();
s.modify(
&(PageIndex(20)..PageIndex(80)),
&LayerMapModification::add_delta_layer("Delta20..80"),
);
// Single-page queries: (last layer, last image layer).
assert_eq!(
s.get(&(PageIndex(5)..PageIndex(6))).last_layers(),
(&Some("Img0..70".to_owned()), &Some("Img0..70".to_owned()))
);
assert_eq!(
s.get(&(PageIndex(15)..PageIndex(16))).last_layers(),
(
&Some("Delta10..60".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(25)..PageIndex(26))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img0..70".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(65)..PageIndex(66))).last_layers(),
(
&Some("Delta20..80".to_owned()),
&Some("Img50..100".to_owned())
)
);
assert_eq!(
s.get(&(PageIndex(95)..PageIndex(96))).last_layers(),
(
&Some("Img50..100".to_owned()),
&Some("Img50..100".to_owned())
)
);
// Range queries: both delta layers overlap exactly on pages 20..60.
assert_eq!(
s.get(&(PageIndex(0)..PageIndex(100))).max_delta_layers(),
&(2, PageIndex(20)..PageIndex(60)),
);
// The snapshot only knows about "Delta10..60".
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(0)..PageIndex(100)))
.max_delta_layers(),
(1, PageIndex(10)..PageIndex(60)),
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(30))).max_delta_layers(),
(2, PageIndex(20)..PageIndex(30))
);
assert_eq!(
*s.get(&(PageIndex(10)..PageIndex(20))).max_delta_layers(),
(1, PageIndex(10)..PageIndex(20))
);
assert_eq!(
*s.get(&(PageIndex(70)..PageIndex(80))).max_delta_layers(),
(1, PageIndex(70)..PageIndex(80))
);
assert_eq!(
*s_before_last_delta
.get(&(PageIndex(70)..PageIndex(80)))
.max_delta_layers(),
(0, PageIndex(70)..PageIndex(80))
);
}
// Runs the layer-map scenario against the naive reference implementation.
#[test]
fn test_naive() {
test_layer_map::<NaiveVecStorage<_, _, _>>();
}
// Runs the same scenario against the persistent segment tree.
#[test]
fn test_segment_tree() {
test_layer_map::<PersistentSegmentTree<_, _, _>>();
}

View File

@@ -1,116 +0,0 @@
use persistent_range_query::naive::*;
use persistent_range_query::ops::rsq::AddAssignModification::Add;
use persistent_range_query::ops::rsq::*;
use persistent_range_query::ops::SameElementsInitializer;
use persistent_range_query::segment_tree::{MidpointableKey, PersistentSegmentTree};
use persistent_range_query::{PersistentVecStorage, VecReadableVersion};
use rand::{Rng, SeedableRng};
use std::ops::Range;
// Key type used by the range-sum tests.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
struct K(u16);
// Keys map to vector slots by their offset from the first key of the range.
impl IndexableKey for K {
    fn index(all_keys: &Range<Self>, key: &Self) -> usize {
        let base = all_keys.start.0 as usize;
        let position = key.0 as usize;
        position - base
    }

    fn element_range(all_keys: &Range<Self>, index: usize) -> Range<Self> {
        let first = all_keys.start.0 + index as u16;
        K(first)..K(first + 1)
    }
}
// The sum of identical elements is the element value times the number of keys in the range.
impl SumOfSameElements<K> for i32 {
fn sum(initial_element_value: &Self, keys: &Range<K>) -> Self {
initial_element_value * (keys.end.0 - keys.start.0) as Self
}
}
// The midpoint is the start plus half of the range length, rounded down.
impl MidpointableKey for K {
    fn midpoint(range: &Range<Self>) -> Self {
        let first = range.start.0;
        let half_len = (range.end.0 - first) / 2;
        K(first + half_len)
    }
}
// Small hand-checked scenario shared by all storage implementations.
fn test_storage<
S: PersistentVecStorage<AddAssignModification<i32>, SameElementsInitializer<i32>, K>,
>() {
// Twelve elements, all zero.
let mut s = S::new(K(0)..K(12), SameElementsInitializer::new(0i32));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 0);
// Elements 2, 3, 4 become 3.
s.modify(&(K(2)..K(5)), &AddAssignModification::Add(3));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 3 + 3);
// Snapshot taken before the modifications below; it must keep the values from this point.
let s_old = s.freeze();
// Elements 3, 4, 5 become 10; element 2 stays 3.
s.modify(&(K(3)..K(6)), &AddAssignModification::Assign(10));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 10 + 10);
// Elements 4, 5 become 12; element 6 becomes 2.
s.modify(&(K(4)..K(7)), &AddAssignModification::Add(2));
assert_eq!(*s.get(&(K(0)..K(12))).sum(), 3 + 10 + 12 + 12 + 2);
assert_eq!(*s.get(&(K(4)..K(6))).sum(), 12 + 12);
// In the snapshot element 4 is still 3 and element 5 is still 0.
assert_eq!(*s_old.get(&(K(4)..K(6))).sum(), 3);
}
// Runs the range-sum scenario against the naive reference implementation.
#[test]
fn test_naive() {
test_storage::<NaiveVecStorage<_, _, _>>();
}
// Runs the same scenario against the persistent segment tree.
#[test]
fn test_segment_tree() {
test_storage::<PersistentSegmentTree<_, _, _>>();
}
// Randomized differential test: the segment tree must agree with the naive
// implementation after every operation, and frozen versions of both must keep
// returning the value they had when they were frozen.
#[test]
fn test_stress() {
    const LEN: u16 = 17_238;
    const OPERATIONS: i32 = 20_000;

    // Fixed seed keeps the test deterministic.
    let mut rng = rand::rngs::StdRng::seed_from_u64(0);
    let mut naive: NaiveVecStorage<AddAssignModification<i32>, _, _> =
        NaiveVecStorage::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));
    let mut segm_tree: PersistentSegmentTree<AddAssignModification<i32>, _, _> =
        PersistentSegmentTree::new(K(0)..K(LEN), SameElementsInitializer::new(2i32));

    // Returns a random, possibly empty, range with `start <= end` inside `0..LEN`.
    fn gen_range(rng: &mut impl Rng) -> Range<K> {
        let l: u16 = rng.gen_range(0..LEN);
        let r: u16 = rng.gen_range(0..LEN);
        if l <= r {
            K(l)..K(r)
        } else {
            K(r)..K(l)
        }
    }

    for _ in 0..2 {
        let checksum_range = gen_range(&mut rng);
        let checksum_before: i32 = *naive.get(&checksum_range).sum();
        assert_eq!(checksum_before, *segm_tree.get(&checksum_range).sum());

        let naive_before = naive.freeze();
        let segm_tree_before = segm_tree.freeze();
        // Both snapshots must report the checksum right after being taken.
        assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
        assert_eq!(
            checksum_before,
            *segm_tree_before.get(&checksum_range).sum()
        );

        for _ in 0..OPERATIONS {
            {
                let range = gen_range(&mut rng);
                assert_eq!(naive.get(&range).sum(), segm_tree.get(&range).sum());
            }
            {
                let range = gen_range(&mut rng);
                let val = rng.gen_range(-10i32..=10i32);
                let op = Add(val);
                naive.modify(&range, &op);
                segm_tree.modify(&range, &op);
            }
        }

        // ...and must still report it after all the modifications of the live storages.
        assert_eq!(checksum_before, *naive_before.get(&checksum_range).sum());
        assert_eq!(
            checksum_before,
            *segm_tree_before.get(&checksum_range).sum()
        );
    }
}

View File

@@ -33,8 +33,8 @@ pub struct Segment {
/// Logical size before this state
start_size: u64,
/// Logical size at this state. Can be None in the last Segment of a branch.
pub end_size: Option<u64>,
/// Logical size at this state
pub end_size: u64,
/// Indices to [`Storage::segments`]
///
@@ -115,7 +115,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
start_lsn: 0,
end_lsn: 0,
start_size: 0,
end_size: Some(0),
end_size: 0,
children_after: Vec::new(),
};
@@ -125,39 +125,6 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
}
}
/// Advances the branch with a new point, at given LSN.
///
/// Appends a new segment after the branch's current last segment and makes it the
/// branch's new last segment. `op` becomes the segment's operation label and `size`
/// its `end_size`.
///
/// Panics if `branch` is unknown, if `lsn` is not greater than the end LSN of the
/// branch's last segment, or if that last segment has no `end_size`.
pub fn insert_point<Q: ?Sized>(
&mut self,
branch: &Q,
op: Cow<'static, str>,
lsn: u64,
size: Option<u64>,
) where
K: std::borrow::Borrow<Q>,
Q: std::hash::Hash + Eq,
{
let lastseg_id = *self.branches.get(branch).unwrap();
// Index the new segment will get once it is pushed to `self.segments` below.
let newseg_id = self.segments.len();
let lastseg = &mut self.segments[lastseg_id];
assert!(lsn > lastseg.end_lsn);
let newseg = Segment {
op,
parent: Some(lastseg_id),
// The new segment starts exactly where the previous one ended.
start_lsn: lastseg.end_lsn,
end_lsn: lsn,
start_size: lastseg.end_size.unwrap(),
end_size: size,
children_after: Vec::new(),
needed: false,
};
lastseg.children_after.push(newseg_id);
self.segments.push(newseg);
// The same key was already looked up successfully at the top of the function.
*self.branches.get_mut(branch).expect("read already") = newseg_id;
}
/// Advances the branch with the named operation, by the relative LSN and logical size bytes.
pub fn modify_branch<Q: ?Sized>(
&mut self,
@@ -178,8 +145,8 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
parent: Some(lastseg_id),
start_lsn: lastseg.end_lsn,
end_lsn: lastseg.end_lsn + lsn_bytes,
start_size: lastseg.end_size.unwrap(),
end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64),
start_size: lastseg.end_size,
end_size: (lastseg.end_size as i64 + size_bytes) as u64,
children_after: Vec::new(),
needed: false,
};
@@ -354,7 +321,7 @@ impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
Some(SegmentSize {
seg_id,
method: SnapshotAfter,
this_size: seg.end_size.unwrap(),
this_size: seg.end_size,
children,
})
} else {

View File

@@ -174,7 +174,7 @@ fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
let seg_id = node.seg_id;
let seg = segments.get(seg_id).unwrap();
let lsn = seg.end_lsn;
let size = seg.end_size.unwrap_or(0);
let size = seg.end_size;
let method = node.method;
println!(" {{");
@@ -226,7 +226,7 @@ fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
print!(
" label=\"{} / {}\"",
next.end_lsn - seg.end_lsn,
(next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
(next.end_size as i128 - seg.end_size as i128)
);
} else {
print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);

View File

@@ -204,17 +204,6 @@ pub struct TenantId(Id);
id_newtype!(TenantId);
/// Neon Connection Id identifies long-lived connections (for example a pagestream
/// connection with the page_service). It is used for better logging and tracing.
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {

View File

@@ -48,25 +48,6 @@ pub mod nonblock;
// Default signal handling
pub mod signals;
/// Failpoint that, when enabled, sleeps asynchronously for the configured
/// number of milliseconds.
///
/// use with fail::cfg("$name", "return(2000)")
///
/// The failpoint's `return(...)` argument is parsed as a `u64` millisecond
/// count; a missing or non-numeric argument panics. Must be expanded inside an
/// `async` context, because the expansion `.await`s the sleep.
#[macro_export]
macro_rules! failpoint_sleep_millis_async {
    ($name:literal) => {{
        // `fail_point!` can only `return` from the enclosing function, so it is
        // wrapped in a closure to turn that return into an ordinary value.
        let should_sleep: Option<std::time::Duration> = (|| {
            fail::fail_point!($name, |v: Option<_>| {
                let millis = v.unwrap().parse::<u64>().unwrap();
                // Fully qualified: paths in a macro expansion are resolved at
                // the call site, which may not have `Duration` imported.
                Some(std::time::Duration::from_millis(millis))
            });
            None
        })();
        if let Some(d) = should_sleep {
            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
            tokio::time::sleep(d).await;
            tracing::info!("failpoint {:?}: sleep done", $name);
        }
    }};
}
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -138,7 +138,7 @@ impl FromStr for Lsn {
///
/// If the input string is missing the '/' character, then use `Lsn::from_hex`
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut splitter = s.trim().split('/');
let mut splitter = s.split('/');
if let (Some(left), Some(right), None) = (splitter.next(), splitter.next(), splitter.next())
{
let left_num = u32::from_str_radix(left, 16).map_err(|_| LsnParseError)?;
@@ -270,11 +270,6 @@ mod tests {
);
assert_eq!(Lsn::from_hex("0"), Ok(Lsn(0)));
assert_eq!(Lsn::from_hex("F12345678AAAA5555"), Err(LsnParseError));
let expected_lsn = Lsn(0x3C490F8);
assert_eq!(" 0/3C490F8".parse(), Ok(expected_lsn));
assert_eq!("0/3C490F8 ".parse(), Ok(expected_lsn));
assert_eq!(" 0/3C490F8 ".parse(), Ok(expected_lsn));
}
#[test]

View File

@@ -76,7 +76,3 @@ tempfile = "3.2"
[[bench]]
name = "bench_layer_map"
harness = false
[[bench]]
name = "bench_walredo"
harness = false

File diff suppressed because one or more lines are too long

View File

@@ -199,20 +199,6 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
logging::init(conf.log_format)?;
info!("version: {}", version());
// If any failpoints were set from FAILPOINTS environment variable,
// print them to the log for debugging purposes
let failpoints = fail::list();
if !failpoints.is_empty() {
info!(
"started with failpoints: {}",
failpoints
.iter()
.map(|(name, actions)| format!("{name}={actions}"))
.collect::<Vec<String>>()
.join(";")
)
}
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
let lock_file = match lock_file::create_lock_file(&lock_file_path, Pid::this().to_string()) {
lock_file::LockCreationResult::Created {

View File

@@ -8,7 +8,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use remote_storage::RemoteStorageConfig;
use std::env;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
@@ -415,22 +414,6 @@ impl PageServerConf {
)
}
pub fn traces_path(&self) -> PathBuf {
self.workdir.join("traces")
}
pub fn trace_path(
&self,
tenant_id: &TenantId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> PathBuf {
self.traces_path()
.join(tenant_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
@@ -614,9 +597,8 @@ impl PageServerConf {
PathBuf::from(format!("../tmp_check/test_{test_name}"))
}
#[cfg(test)]
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
PageServerConf {
id: NodeId(0),
wait_lsn_timeout: Duration::from_secs(60),
@@ -627,7 +609,7 @@ impl PageServerConf {
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
superuser: "cloud_admin".to_string(),
workdir: repo_dir,
pg_distrib_dir,
pg_distrib_dir: PathBuf::new(),
auth_type: AuthType::Trust,
auth_validation_public_key_path: None,
remote_storage_config: None,

View File

@@ -667,7 +667,6 @@ components:
- disk_consistent_lsn
- awaits_download
- state
- latest_gc_cutoff_lsn
properties:
timeline_id:
type: string
@@ -712,9 +711,6 @@ components:
type: boolean
state:
type: string
latest_gc_cutoff_lsn:
type: string
format: hex
# These 'local' and 'remote' fields just duplicate some of the fields
# above. They are kept for backwards-compatibility. They can be removed,

View File

@@ -618,7 +618,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
check_permission(&request, None)?;
let request_data: TenantCreateRequest = json_request(&mut request).await?;
println!("tenant create: {:?}", request_data.trace_read_requests);
let remote_index = get_state(&request).remote_index.clone();
let mut tenant_conf = TenantConfOpt::default();
@@ -660,9 +659,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -750,9 +746,6 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
}
if let Some(trace_read_requests) = request_data.trace_read_requests {
tenant_conf.trace_read_requests = Some(trace_read_requests);
}
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
@@ -832,14 +825,14 @@ async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body
let tenant = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
let _span_guard =
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
// Use tenant's pitr setting
let pitr = tenant.get_pitr_interval();
let result = tenant
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
.instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id))
.await
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
// better once the types support it.
.map_err(ApiError::InternalServerError)?;
@@ -875,7 +868,6 @@ async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)
.await
.map_err(ApiError::InternalServerError)?;
json_response(StatusCode::OK, ())

View File

@@ -15,7 +15,6 @@ pub mod tenant;
pub mod tenant_config;
pub mod tenant_mgr;
pub mod tenant_tasks;
pub mod trace;
pub mod virtual_file;
pub mod walingest;
pub mod walreceiver;

View File

@@ -25,11 +25,9 @@ use std::net::TcpListener;
use std::str;
use std::str::FromStr;
use std::sync::Arc;
use tokio::pin;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::id::ConnectionId;
use utils::{
auth::{self, Claims, JwtAuth, Scope},
id::{TenantId, TimelineId},
@@ -48,7 +46,6 @@ use crate::task_mgr;
use crate::task_mgr::TaskKind;
use crate::tenant::Timeline;
use crate::tenant_mgr;
use crate::trace::Tracer;
use crate::CheckpointConfig;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -76,12 +73,6 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
FeMessage::CopyData(bytes) => bytes,
FeMessage::CopyDone => { break },
FeMessage::Sync => continue,
FeMessage::Terminate => {
let msg = format!("client terminated connection with Terminate message during COPY");
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
break;
}
m => {
let msg = format!("unexpected message {:?}", m);
pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
@@ -93,10 +84,10 @@ fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream<Item = io::Result<Byt
yield copy_data_bytes;
}
Ok(None) => {
let msg = "client closed connection during COPY";
let msg = "client closed connection";
pgb.write_message(&BeMessage::ErrorResponse(msg))?;
pgb.flush().await?;
Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
Err(io::Error::new(io::ErrorKind::Other, msg))?;
}
Err(e) => {
Err(io::Error::new(io::ErrorKind::Other, e))?;
@@ -277,18 +268,6 @@ impl PageServerHandler {
// so there is no need to reset the association
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
// Make request tracer if needed
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path = tenant
.conf
.trace_path(&tenant_id, &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
};
// Check that the timeline exists
let timeline = get_local_timeline(tenant_id, timeline_id)?;
@@ -321,11 +300,6 @@ impl PageServerHandler {
trace!("query: {copy_data_bytes:?}");
// Trace request if needed
if let Some(t) = tracer.as_mut() {
t.trace(&copy_data_bytes)
}
let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
let response = match neon_fe_msg {
@@ -393,12 +367,14 @@ impl PageServerHandler {
pgb.write_message(&BeMessage::CopyInResponse)?;
pgb.flush().await?;
let copyin_stream = copyin_stream(pgb);
pin!(copyin_stream);
timeline
.import_basebackup_from_tar(&mut copyin_stream, base_lsn)
.await?;
// import_basebackup_from_tar() is not async, mainly because the Tar crate
// it uses is not async. So we need to jump through some hoops:
// - convert the input from client connection to a synchronous Read
// - use block_in_place()
let mut copyin_stream = Box::pin(copyin_stream(pgb));
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| timeline.import_basebackup_from_tar(reader, base_lsn))?;
timeline.initialize()?;
// Drain the rest of the Copy data
let mut bytes_after_tar = 0;
@@ -463,7 +439,7 @@ impl PageServerHandler {
// We only want to persist the data, and it doesn't matter if it's in the
// shape of deltas or images.
info!("flushing layers");
timeline.checkpoint(CheckpointConfig::Flush).await?;
timeline.checkpoint(CheckpointConfig::Flush)?;
info!("done");
Ok(())

View File

@@ -12,12 +12,8 @@
//!
use anyhow::{bail, Context};
use bytes::Bytes;
use futures::Stream;
use pageserver_api::models::TimelineState;
use tokio::sync::watch;
use tokio_util::io::StreamReader;
use tokio_util::io::SyncIoBridge;
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
@@ -33,7 +29,6 @@ use std::io::Write;
use std::ops::Bound::Included;
use std::path::Path;
use std::path::PathBuf;
use std::pin::Pin;
use std::process::Command;
use std::process::Stdio;
use std::sync::Arc;
@@ -142,7 +137,7 @@ pub struct Tenant {
pub struct UninitializedTimeline<'t> {
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
raw_timeline: Option<(Timeline, TimelineUninitMark)>,
}
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
@@ -174,6 +169,7 @@ impl UninitializedTimeline<'_> {
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
let new_timeline = Arc::new(new_timeline);
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
// TODO it would be good to ensure that, but apparently a lot of our testing is dependend on that at least
@@ -201,9 +197,6 @@ impl UninitializedTimeline<'_> {
})?;
new_timeline.set_state(TimelineState::Active);
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
new_timeline.launch_wal_receiver();
}
}
@@ -212,28 +205,20 @@ impl UninitializedTimeline<'_> {
}
/// Prepares timeline data by loading it from the basebackup archive.
pub async fn import_basebackup_from_tar(
self,
mut copyin_stream: &mut Pin<&mut impl Stream<Item = io::Result<Bytes>>>,
pub fn import_basebackup_from_tar(
&self,
reader: impl std::io::Read,
base_lsn: Lsn,
) -> anyhow::Result<Arc<Timeline>> {
) -> anyhow::Result<()> {
let raw_timeline = self.raw_timeline()?;
// import_basebackup_from_tar() is not async, mainly because the Tar crate
// it uses is not async. So we need to jump through some hoops:
// - convert the input from client connection to a synchronous Read
// - use block_in_place()
let reader = SyncIoBridge::new(StreamReader::new(&mut copyin_stream));
tokio::task::block_in_place(|| {
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn)
.context("Failed to import basebackup")
})?;
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
// We want to run proper checkpoint before we mark timeline as available to outside world
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
raw_timeline.maybe_spawn_flush_loop();
import_datadir::import_basebackup_from_tar(raw_timeline, reader, base_lsn).with_context(
|| {
format!(
"Failed to import basebackup for timeline {}/{}",
self.owning_tenant.tenant_id, self.timeline_id
)
},
)?;
fail::fail_point!("before-checkpoint-new-timeline", |_| {
bail!("failpoint before-checkpoint-new-timeline");
@@ -241,15 +226,16 @@ impl UninitializedTimeline<'_> {
raw_timeline
.checkpoint(CheckpointConfig::Flush)
.await
.context("Failed to checkpoint after basebackup import")?;
let timeline = self.initialize()?;
Ok(timeline)
.with_context(|| {
format!(
"Failed to checkpoint after basebackup import for timeline {}/{}",
self.owning_tenant.tenant_id, self.timeline_id
)
})?;
Ok(())
}
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
fn raw_timeline(&self) -> anyhow::Result<&Timeline> {
Ok(&self
.raw_timeline
.as_ref()
@@ -461,7 +447,14 @@ impl Tenant {
.context("Cannot branch off the timeline that's not present in pageserver")?;
if let Some(lsn) = ancestor_start_lsn.as_mut() {
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
*lsn = lsn.align();
ancestor_timeline.wait_lsn(*lsn).await?;
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
@@ -473,19 +466,11 @@ impl Tenant {
ancestor_ancestor_lsn,
);
}
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
ancestor_timeline.wait_lsn(*lsn).await?;
}
self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
}
None => self.bootstrap_timeline(new_timeline_id, pg_version).await?,
None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
};
// Have added new timeline into the tenant, now its background tasks are needed.
@@ -503,7 +488,7 @@ impl Tenant {
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
/// to make tests more deterministic.
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
pub async fn gc_iteration(
pub fn gc_iteration(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
@@ -519,13 +504,11 @@ impl Tenant {
.map(|x| x.to_string())
.unwrap_or_else(|| "-".to_string());
{
let _timer = STORAGE_TIME
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
.start_timer();
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
.await
}
STORAGE_TIME
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
.observe_closure_duration(|| {
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
})
}
/// Perform one compaction iteration.
@@ -561,24 +544,23 @@ impl Tenant {
///
/// Used at graceful shutdown.
///
pub async fn checkpoint(&self) -> anyhow::Result<()> {
pub fn checkpoint(&self) -> anyhow::Result<()> {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// checkpoints. We don't want to block everything else while the
// checkpoint runs.
let timelines_to_checkpoint = {
let timelines = self.timelines.lock().unwrap();
timelines
.iter()
.map(|(id, timeline)| (*id, Arc::clone(timeline)))
.collect::<Vec<_>>()
};
let timelines = self.timelines.lock().unwrap();
let timelines_to_checkpoint = timelines
.iter()
.map(|(timeline_id, timeline)| (*timeline_id, Arc::clone(timeline)))
.collect::<Vec<_>>();
drop(timelines);
for (id, timeline) in &timelines_to_checkpoint {
timeline
.checkpoint(CheckpointConfig::Flush)
.instrument(info_span!("checkpoint", timeline = %id, tenant = %self.tenant_id))
.await?;
for (timeline_id, timeline) in &timelines_to_checkpoint {
let _entered =
info_span!("checkpoint", timeline = %timeline_id, tenant = %self.tenant_id)
.entered();
timeline.checkpoint(CheckpointConfig::Flush)?;
}
Ok(())
@@ -807,13 +789,6 @@ impl Tenant {
.unwrap_or(self.conf.default_tenant_conf.pitr_interval)
}
pub fn get_trace_read_requests(&self) -> bool {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
.trace_read_requests
.unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
}
pub fn update_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
self.tenant_conf.write().unwrap().update(&new_tenant_conf);
}
@@ -999,7 +974,7 @@ impl Tenant {
// - if a relation has a non-incremental persistent layer on a child branch, then we
// don't need to keep that in the parent anymore. But currently
// we do.
async fn gc_iteration_internal(
fn gc_iteration_internal(
&self,
target_timeline_id: Option<TimelineId>,
horizon: u64,
@@ -1011,10 +986,6 @@ impl Tenant {
let gc_timelines = self.refresh_gc_info_internal(target_timeline_id, horizon, pitr)?;
utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines");
info!("starting on {} timelines", gc_timelines.len());
// Perform GC for each timeline.
//
// Note that we don't hold the GC lock here because we don't want
@@ -1036,7 +1007,7 @@ impl Tenant {
// so that they too can be garbage collected. That's
// used in tests, so we want as deterministic results as possible.
if checkpoint_before_gc {
timeline.checkpoint(CheckpointConfig::Forced).await?;
timeline.checkpoint(CheckpointConfig::Forced)?;
info!(
"timeline {} checkpoint_before_gc done",
timeline.timeline_id
@@ -1146,6 +1117,7 @@ impl Tenant {
}
}
drop(gc_cs);
Ok(gc_timelines)
}
@@ -1250,15 +1222,14 @@ impl Tenant {
/// - run initdb to init temporary instance and get bootstrap data
/// - after initialization complete, remove the temp dir.
async fn bootstrap_timeline(
fn bootstrap_timeline(
&self,
timeline_id: TimelineId,
pg_version: u32,
) -> anyhow::Result<Arc<Timeline>> {
let timeline_uninit_mark = {
let timelines = self.timelines.lock().unwrap();
self.create_timeline_uninit_mark(timeline_id, &timelines)?
};
let timelines = self.timelines.lock().unwrap();
let timeline_uninit_mark = self.create_timeline_uninit_mark(timeline_id, &timelines)?;
drop(timelines);
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
// temporary directory for basebackup files for the given timeline.
let initdb_path = path_with_suffix_extension(
@@ -1308,35 +1279,25 @@ impl Tenant {
let tenant_id = raw_timeline.owning_tenant.tenant_id;
let unfinished_timeline = raw_timeline.raw_timeline()?;
tokio::task::block_in_place(|| {
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
pgdata_path,
pgdata_lsn,
)
})
import_datadir::import_timeline_from_postgres_datadir(
unfinished_timeline,
pgdata_path,
pgdata_lsn,
)
.with_context(|| {
format!("Failed to import pgdatadir for timeline {tenant_id}/{timeline_id}")
})?;
// Flush loop needs to be spawned in order for checkpoint to be able to flush.
// We want to run proper checkpoint before we mark timeline as available to outside world
// Thus spawning flush loop manually and skipping flush_loop setup in initialize_with_lock
unfinished_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
anyhow::bail!("failpoint before-checkpoint-new-timeline");
});
unfinished_timeline
.checkpoint(CheckpointConfig::Forced).await
.checkpoint(CheckpointConfig::Forced)
.with_context(|| format!("Failed to checkpoint after pgdatadir import for timeline {tenant_id}/{timeline_id}"))?;
let timeline = {
let mut timelines = self.timelines.lock().unwrap();
raw_timeline.initialize_with_lock(&mut timelines, false)?
};
let mut timelines = self.timelines.lock().unwrap();
let timeline = raw_timeline.initialize_with_lock(&mut timelines, false)?;
drop(timelines);
info!(
"created root timeline {} timeline.lsn {}",
@@ -1376,7 +1337,7 @@ impl Tenant {
Ok(UninitializedTimeline {
owning_tenant: self,
timeline_id: new_timeline_id,
raw_timeline: Some((Arc::new(new_timeline), uninit_mark)),
raw_timeline: Some((new_timeline, uninit_mark)),
})
}
Err(e) => {
@@ -1495,7 +1456,7 @@ impl Tenant {
let timeline = UninitializedTimeline {
owning_tenant: self,
timeline_id,
raw_timeline: Some((Arc::new(dummy_timeline), TimelineUninitMark::dummy())),
raw_timeline: Some((dummy_timeline, TimelineUninitMark::dummy())),
};
match timeline.initialize_with_lock(&mut timelines_accessor, true) {
Ok(initialized_timeline) => {
@@ -1678,7 +1639,6 @@ pub mod harness {
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
trace_read_requests: Some(tenant_conf.trace_read_requests),
}
}
}
@@ -1950,7 +1910,7 @@ mod tests {
Ok(())
}
async fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> anyhow::Result<()> {
let mut lsn = start_lsn;
#[allow(non_snake_case)]
{
@@ -1971,7 +1931,7 @@ mod tests {
writer.finish_write(lsn);
lsn += 0x10;
}
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
{
let writer = tline.writer();
writer.put(
@@ -1988,26 +1948,24 @@ mod tests {
)?;
writer.finish_write(lsn);
}
tline.checkpoint(CheckpointConfig::Forced).await
tline.checkpoint(CheckpointConfig::Forced)
}
#[tokio::test]
async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
#[test]
fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
let tenant =
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
tenant
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
.await?;
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
@@ -2052,14 +2010,14 @@ mod tests {
/*
// FIXME: This currently fails to error out. Calling GC doesn't currently
// remove the old value, we'd need to work a little harder
#[tokio::test]
async fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
#[test]
fn test_prohibit_get_for_garbage_collected_data() -> anyhow::Result<()> {
let repo =
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
@@ -2072,47 +2030,43 @@ mod tests {
}
*/
#[tokio::test]
async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
#[test]
fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> {
let tenant =
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
tenant
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
.await?;
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
Ok(())
}
#[tokio::test]
async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
#[test]
fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> {
let tenant =
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = tenant
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
// run gc on parent
tenant
.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)
.await?;
tenant.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// Check that the data is still accessible on the branch.
assert_eq!(
@@ -2123,8 +2077,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn timeline_load() -> anyhow::Result<()> {
#[test]
fn timeline_load() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = TenantHarness::create(TEST_NAME)?;
{
@@ -2132,8 +2086,8 @@ mod tests {
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x8000)).await?;
tline.checkpoint(CheckpointConfig::Forced).await?;
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
let tenant = harness.load();
@@ -2144,8 +2098,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn timeline_load_with_ancestor() -> anyhow::Result<()> {
#[test]
fn timeline_load_with_ancestor() -> anyhow::Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = TenantHarness::create(TEST_NAME)?;
// create two timelines
@@ -2155,8 +2109,8 @@ mod tests {
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
.initialize()?;
make_some_layers(tline.as_ref(), Lsn(0x20)).await?;
tline.checkpoint(CheckpointConfig::Forced).await?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tline.checkpoint(CheckpointConfig::Forced)?;
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
@@ -2164,8 +2118,8 @@ mod tests {
.get_timeline(NEW_TIMELINE_ID, true)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
tline.checkpoint(CheckpointConfig::Forced).await?;
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
// check that both of them are initially unloaded
@@ -2225,8 +2179,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_images() -> anyhow::Result<()> {
#[test]
fn test_images() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_images")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2237,7 +2191,7 @@ mod tests {
writer.finish_write(Lsn(0x10));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
let writer = tline.writer();
@@ -2245,7 +2199,7 @@ mod tests {
writer.finish_write(Lsn(0x20));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
let writer = tline.writer();
@@ -2253,7 +2207,7 @@ mod tests {
writer.finish_write(Lsn(0x30));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
let writer = tline.writer();
@@ -2261,7 +2215,7 @@ mod tests {
writer.finish_write(Lsn(0x40));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
@@ -2277,8 +2231,8 @@ mod tests {
// Insert 1000 key-value pairs with increasing keys, checkpoint,
// repeat 50 times.
//
#[tokio::test]
async fn test_bulk_insert() -> anyhow::Result<()> {
#[test]
fn test_bulk_insert() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_bulk_insert")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2311,7 +2265,7 @@ mod tests {
let cutoff = tline.get_last_record_lsn();
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
tline.gc()?;
}
@@ -2319,8 +2273,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_random_updates() -> anyhow::Result<()> {
#[test]
fn test_random_updates() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_random_updates")?.load();
let tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2383,7 +2337,7 @@ mod tests {
println!("checkpointing {}", lsn);
let cutoff = tline.get_last_record_lsn();
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
tline.gc()?;
}
@@ -2391,8 +2345,8 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn test_traverse_branches() -> anyhow::Result<()> {
#[test]
fn test_traverse_branches() -> anyhow::Result<()> {
let tenant = TenantHarness::create("test_traverse_branches")?.load();
let mut tline = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?
@@ -2464,7 +2418,7 @@ mod tests {
println!("checkpointing {}", lsn);
let cutoff = tline.get_last_record_lsn();
tline.update_gc_info(Vec::new(), cutoff, Duration::ZERO)?;
tline.checkpoint(CheckpointConfig::Forced).await?;
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
tline.gc()?;
}

View File

@@ -74,7 +74,6 @@ where
};
dstbuf.clear();
dstbuf.reserve(len);
// Read the payload
let mut remain = len;

View File

@@ -260,9 +260,8 @@ impl Layer for DeltaLayer {
// Ok, 'offsets' now contains the offsets of all the entries we need to read
let mut cursor = file.block_cursor();
let mut buf = Vec::new();
for (entry_lsn, pos) in offsets {
cursor.read_blob_into_buf(pos, &mut buf).with_context(|| {
let buf = cursor.read_blob(pos).with_context(|| {
format!(
"Failed to read blob from virtual file {}",
file.file.path.display()

View File

@@ -1,4 +1,3 @@
use std::cmp;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
@@ -41,21 +40,6 @@ struct TimelineInputs {
next_gc_cutoff: Lsn,
}
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`Timeline::latest_gc_cutoff`].
///
/// For timelines in general:
///
/// ```ignore
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
@@ -104,18 +88,13 @@ pub(super) async fn gather_inputs(
let gc_info = timeline.gc_info.read().unwrap();
// similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
// new gc run, which we have no control over. however differently from `Timeline::gc`
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
// new gc run, which we have no control over.
// maybe this should be moved to gc_info.next_gc_cutoff()?
let next_gc_cutoff = std::cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
// the minimum where we should find the next_gc_cutoff for our calculations.
//
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
let maybe_cutoff = if next_gc_cutoff > timeline.get_ancestor_lsn() {
// only include these if they are after branching point; otherwise we would end up
// with duplicate updates before the actual branching.
Some((next_gc_cutoff, LsnKind::GcCutOff))
} else {
None
@@ -183,19 +162,6 @@ pub(super) async fn gather_inputs(
}
}
// all timelines also have an end point if they have made any progress
if last_record_lsn > timeline.get_ancestor_lsn()
&& !interesting_lsns
.iter()
.any(|(lsn, _)| lsn == &last_record_lsn)
{
updates.push(Update {
lsn: last_record_lsn,
command: Command::EndOfBranch,
timeline_id: timeline.timeline_id,
});
}
timeline_inputs.insert(
timeline.timeline_id,
TimelineInputs {
@@ -283,22 +249,48 @@ impl ModelInputs {
// impossible to always determine a single main branch.
let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
// tracking these not to require modifying the current implementation of the size model,
// which works in relative LSNs and sizes.
let mut last_state: HashMap<TimelineId, (Lsn, u64)> = HashMap::new();
for update in &self.updates {
let Update {
lsn,
command: op,
timeline_id,
} = update;
let Lsn(now) = *lsn;
match op {
Command::Update(sz) => {
storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz));
}
Command::EndOfBranch => {
storage.insert_point(&Some(*timeline_id), "".into(), now, None);
let latest = last_state.get_mut(timeline_id).ok_or_else(|| {
anyhow::anyhow!(
"ordering-mismatch: there must had been a previous state for {timeline_id}"
)
})?;
let lsn_bytes = {
let Lsn(now) = lsn;
let Lsn(prev) = latest.0;
debug_assert!(prev <= *now, "self.updates should had been sorted");
now - prev
};
let size_diff =
i64::try_from(*sz as i128 - latest.1 as i128).with_context(|| {
format!("size difference i64 overflow for {timeline_id}")
})?;
storage.modify_branch(&Some(*timeline_id), "".into(), lsn_bytes, size_diff);
*latest = (*lsn, *sz);
}
Command::BranchFrom(parent) => {
storage.branch(parent, Some(*timeline_id));
let size = parent
.as_ref()
.and_then(|id| last_state.get(id))
.map(|x| x.1)
.unwrap_or(0);
last_state.insert(*timeline_id, (*lsn, size));
}
}
}
@@ -307,7 +299,10 @@ impl ModelInputs {
}
}
/// A point of interest in the tree of branches
/// Single size model update.
///
/// Sizing model works with relative increments over latest branch state.
/// Updates are absolute, so additional state needs to be tracked when applying.
#[serde_with::serde_as]
#[derive(
Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
@@ -326,7 +321,6 @@ struct Update {
enum Command {
Update(u64),
BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
EndOfBranch,
}
impl std::fmt::Debug for Command {
@@ -336,7 +330,6 @@ impl std::fmt::Debug for Command {
match self {
Self::Update(arg0) => write!(f, "Update({arg0})"),
Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
Self::EndOfBranch => write!(f, "EndOfBranch"),
}
}
}

View File

@@ -16,7 +16,7 @@ use std::fs;
use std::ops::{Deref, Range};
use std::path::PathBuf;
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
use std::time::{Duration, Instant, SystemTime};
use crate::tenant::{
@@ -61,13 +61,6 @@ use crate::{
storage_sync::{self, index::LayerFileMetadata},
};
/// Lifecycle of a timeline's layer flush loop task.
///
/// `maybe_spawn_flush_loop` consults this so that the loop is spawned at most once, and
/// `flush_frozen_layers_and_wait` refuses to queue a flush unless the state is `Running`.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
enum FlushLoopState {
/// No flush loop task has been spawned yet.
NotStarted,
/// The flush loop task has been spawned and has not returned.
Running,
/// The flush loop returned; attempts to spawn it again are ignored with a warning.
Exited,
}
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -128,16 +121,8 @@ pub struct Timeline {
/// to avoid deadlock.
write_lock: Mutex<()>,
/// Used to avoid multiple `flush_loop` tasks running
flush_loop_state: Mutex<FlushLoopState>,
/// layer_flush_start_tx can be used to wake up the layer-flushing task.
/// The value is a counter, incremented every time a new flush cycle is requested.
/// The flush cycle counter is sent back on the layer_flush_done channel when
/// the flush finishes. You can use that to wait for the flush to finish.
layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
/// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
/// Used to ensure that there is only one task performing flushing at a time
layer_flush_lock: Mutex<()>,
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -481,16 +466,15 @@ impl Timeline {
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
#[instrument(skip(self), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id))]
pub async fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
match cconf {
CheckpointConfig::Flush => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers_and_wait().await
self.flush_frozen_layers(true)
}
CheckpointConfig::Forced => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers_and_wait().await?;
self.flush_frozen_layers(true)?;
self.compact()
}
}
@@ -640,8 +624,24 @@ impl Timeline {
self.last_freeze_at.store(last_lsn);
*(self.last_freeze_ts.write().unwrap()) = Instant::now();
// Wake up the layer flusher
self.flush_frozen_layers();
// Launch a task to flush the frozen layer to disk, unless
// a task was already running. (If the task was running
// at the time that we froze the layer, it must've seen the
// layer we just froze before it exited; see comments
// in flush_frozen_layers())
if let Ok(guard) = self.layer_flush_lock.try_lock() {
drop(guard);
let self_clone = Arc::clone(self);
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
task_mgr::TaskKind::LayerFlushTask,
Some(self.tenant_id),
Some(self.timeline_id),
"layer flush task",
false,
async move { self_clone.flush_frozen_layers(false) },
);
}
}
}
Ok(())
@@ -732,9 +732,6 @@ impl Timeline {
let disk_consistent_lsn = metadata.disk_consistent_lsn();
let (state, _) = watch::channel(TimelineState::Suspended);
let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0);
let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));
let mut result = Timeline {
conf,
tenant_conf,
@@ -762,12 +759,8 @@ impl Timeline {
upload_layers: AtomicBool::new(upload_layers),
flush_loop_state: Mutex::new(FlushLoopState::NotStarted),
layer_flush_start_tx,
layer_flush_done_tx,
write_lock: Mutex::new(()),
layer_flush_lock: Mutex::new(()),
layer_removal_cs: Mutex::new(()),
gc_info: RwLock::new(GcInfo {
@@ -800,48 +793,6 @@ impl Timeline {
result
}
/// Spawns the background layer flush loop for this timeline, at most once.
///
/// Does nothing (apart from logging) when the loop is already running or has
/// already exited. The state mutex is held for the whole call, so the spawned
/// task cannot observe the state before it has been switched to `Running`.
pub(super) fn maybe_spawn_flush_loop(self: &Arc<Self>) {
    let mut state = self.flush_loop_state.lock().unwrap();

    if *state == FlushLoopState::Running {
        info!(
            "skipping attempt to start flush_loop twice {}/{}",
            self.tenant_id, self.timeline_id
        );
        return;
    }
    if *state == FlushLoopState::Exited {
        warn!(
            "ignoring attempt to restart exited flush_loop {}/{}",
            self.tenant_id, self.timeline_id
        );
        return;
    }
    // Only `FlushLoopState::NotStarted` reaches this point.

    let start_rx = self.layer_flush_start_tx.subscribe();
    let timeline = Arc::clone(self);

    info!("spawning flush loop");
    let task_span = info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id);
    task_mgr::spawn(
        task_mgr::BACKGROUND_RUNTIME.handle(),
        task_mgr::TaskKind::LayerFlushTask,
        Some(self.tenant_id),
        Some(self.timeline_id),
        "layer flush task",
        false,
        async move {
            timeline.flush_loop(start_rx).await;
            // Record that the loop is gone so nobody waits on it again.
            let mut exit_state = timeline.flush_loop_state.lock().unwrap();
            assert_eq!(*exit_state, FlushLoopState::Running);
            *exit_state = FlushLoopState::Exited;
            Ok(())
        }
        .instrument(task_span),
    );

    *state = FlushLoopState::Running;
}
pub(super) fn launch_wal_receiver(self: &Arc<Self>) {
if !is_etcd_client_initialized() {
if cfg!(test) {
@@ -1338,95 +1289,53 @@ impl Timeline {
drop(layers);
}
/// Layer flusher task's main loop.
async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver<u64>) {
info!("started flush loop");
loop {
tokio::select! {
_ = task_mgr::shutdown_watcher() => {
info!("shutting down layer flush task");
break;
},
_ = layer_flush_start_rx.changed() => {}
/// Flush all frozen layers to disk.
///
/// Only one task at a time can be doing layer-flushing for a
/// given timeline. If 'wait' is true, and another task is
/// currently doing the flushing, this function will wait for it
/// to finish. If 'wait' is false, this function will return
/// immediately instead.
fn flush_frozen_layers(&self, wait: bool) -> anyhow::Result<()> {
let flush_lock_guard = if wait {
self.layer_flush_lock.lock().unwrap()
} else {
match self.layer_flush_lock.try_lock() {
Ok(guard) => guard,
Err(TryLockError::WouldBlock) => return Ok(()),
Err(TryLockError::Poisoned(err)) => panic!("{:?}", err),
}
};
trace!("waking up");
let timer = self.metrics.flush_time_histo.start_timer();
let flush_counter = *layer_flush_start_rx.borrow();
let result = loop {
let layer_to_flush = {
let layers = self.layers.read().unwrap();
layers.frozen_layers.front().cloned()
// drop 'layers' lock to allow concurrent reads and writes
};
if let Some(layer_to_flush) = layer_to_flush {
if let Err(err) = self.flush_frozen_layer(layer_to_flush).await {
error!("could not flush frozen layer: {err:?}");
break Err(err);
}
continue;
} else {
break Ok(());
}
};
// Notify any listeners that we're done
let _ = self
.layer_flush_done_tx
.send_replace((flush_counter, result));
timer.stop_and_record();
}
}
async fn flush_frozen_layers_and_wait(&self) -> anyhow::Result<()> {
let mut rx = self.layer_flush_done_tx.subscribe();
// Increment the flush cycle counter and wake up the flush task.
// Remember the new value, so that when we listen for the flush
// to finish, we know when the flush that we initiated has
// finished, instead of some other flush that was started earlier.
let mut my_flush_request = 0;
let flush_loop_state = { *self.flush_loop_state.lock().unwrap() };
if flush_loop_state != FlushLoopState::Running {
anyhow::bail!("cannot flush frozen layers when flush_loop is not running, state is {flush_loop_state:?}")
}
self.layer_flush_start_tx.send_modify(|counter| {
my_flush_request = *counter + 1;
*counter = my_flush_request;
});
let timer = self.metrics.flush_time_histo.start_timer();
loop {
{
let (last_result_counter, last_result) = &*rx.borrow();
if *last_result_counter >= my_flush_request {
if let Err(_err) = last_result {
// We already logged the original error in
// flush_loop. We cannot propagate it to the caller
// here, because it might not be Cloneable
anyhow::bail!(
"Could not flush frozen layer. Request id: {}",
my_flush_request
);
} else {
return Ok(());
}
}
let layers = self.layers.read().unwrap();
if let Some(frozen_layer) = layers.frozen_layers.front() {
let frozen_layer = Arc::clone(frozen_layer);
drop(layers); // to allow concurrent reads and writes
self.flush_frozen_layer(frozen_layer)?;
} else {
// Drop the 'layer_flush_lock' *before* 'layers'. That
// way, if you freeze a layer, and then call
// flush_frozen_layers(false), it is guaranteed that
// if another thread was busy flushing layers and the
// call therefore returns immediately, the other
// thread will have seen the newly-frozen layer and
// will flush that too (assuming no errors).
drop(flush_lock_guard);
drop(layers);
break;
}
trace!("waiting for flush to complete");
rx.changed().await?;
trace!("done")
}
}
fn flush_frozen_layers(&self) {
self.layer_flush_start_tx.send_modify(|val| *val += 1);
timer.stop_and_record();
Ok(())
}
/// Flush one frozen in-memory layer to disk, as a new delta layer.
#[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.filename().display()))]
async fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> anyhow::Result<()> {
// As a special case, when we have just imported an image into the repository,
// instead of writing out a L0 delta layer, we directly write out image layer
// files instead. This is possible as long as *all* the data imported into the
@@ -2356,10 +2265,13 @@ impl Timeline {
let last_rec_lsn = data.records.last().unwrap().0;
let img = self
.walredo_mgr
.request_redo(key, request_lsn, base_img, data.records, self.pg_version)
.context("Failed to reconstruct a page image:")?;
let img = self.walredo_mgr.request_redo(
key,
request_lsn,
base_img,
data.records,
self.pg_version,
)?;
if img.len() == page_cache::PAGE_SZ {
let cache = page_cache::get();

View File

@@ -82,7 +82,6 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
}
/// Same as TenantConf, but this struct preserves the information about
@@ -106,7 +105,6 @@ pub struct TenantConfOpt {
#[serde(with = "humantime_serde")]
pub lagging_wal_timeout: Option<Duration>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
}
impl TenantConfOpt {
@@ -140,9 +138,6 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
trace_read_requests: self
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
}
}
@@ -212,10 +207,10 @@ impl TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
}
}
#[cfg(test)]
pub fn dummy_conf() -> Self {
TenantConf {
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
@@ -237,7 +232,6 @@ impl TenantConf {
.unwrap(),
max_lsn_wal_lag: NonZeroU64::new(defaults::DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.unwrap(),
trace_read_requests: false,
}
}
}

View File

@@ -241,7 +241,7 @@ pub async fn shutdown_all_tenants() {
let tenant_id = tenant.tenant_id();
debug!("shutdown tenant {tenant_id}");
if let Err(err) = tenant.checkpoint().await {
if let Err(err) = tenant.checkpoint() {
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
}
}

View File

@@ -71,7 +71,9 @@ async fn compaction_loop(tenant_id: TenantId) {
let mut sleep_duration = tenant.get_compaction_period();
if let Err(e) = tenant.compaction_iteration() {
sleep_duration = wait_duration;
error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
error!("Compaction failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
// Sleep
@@ -117,10 +119,12 @@ async fn gc_loop(tenant_id: TenantId) {
let gc_horizon = tenant.get_gc_horizon();
let mut sleep_duration = gc_period;
if gc_horizon > 0 {
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false).await
if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), false)
{
sleep_duration = wait_duration;
error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
error!("Gc failed, retrying in {:?}: {e:#}", sleep_duration);
#[cfg(feature = "testing")]
std::process::abort();
}
}

View File

@@ -1,36 +0,0 @@
use bytes::Bytes;
use std::{
fs::{create_dir_all, File},
io::{BufWriter, Write},
path::PathBuf,
};
/// Writes trace messages to a file through a buffered writer.
///
/// Buffered data is flushed explicitly via `flush` and again when the tracer is dropped.
pub struct Tracer {
// Buffered handle to the trace file created in `Tracer::new`.
writer: BufWriter<File>,
}
impl Drop for Tracer {
    /// Flushes any buffered trace data when the tracer is dropped.
    ///
    /// A failed flush still panics in the normal case, as before. If the thread
    /// is already unwinding from another panic, a second panic raised from
    /// `drop` would abort the whole process, so in that case the flush is only
    /// attempted on a best-effort basis and its error is ignored.
    fn drop(&mut self) {
        if std::thread::panicking() {
            // Already unwinding: never panic again from here.
            let _ = self.writer.flush();
        } else {
            self.flush()
        }
    }
}
impl Tracer {
    /// Creates the trace file at `path`, creating its parent directories first.
    ///
    /// Panics if `path` has no parent, or if the directory or the file cannot
    /// be created.
    pub fn new(path: PathBuf) -> Self {
        let trace_dir = path.parent().expect("failed to parse parent path");
        create_dir_all(trace_dir).expect("failed to create trace dir");

        let writer = File::create(path)
            .map(BufWriter::new)
            .expect("failed to create trace file");
        Tracer { writer }
    }

    /// Appends `msg` to the trace; panics if the write fails.
    pub fn trace(&mut self, msg: &Bytes) {
        let outcome = self.writer.write_all(msg);
        outcome.expect("failed to write trace");
    }

    /// Pushes buffered trace data out to the file; panics if that fails.
    pub fn flush(&mut self) {
        Write::flush(&mut self.writer).expect("failed to flush trace file");
    }
}

View File

@@ -22,10 +22,10 @@ use byteorder::{ByteOrder, LittleEndian};
use bytes::{BufMut, Bytes, BytesMut};
use nix::poll::*;
use serde::Serialize;
use std::fs;
use std::fs::OpenOptions;
use std::io::prelude::*;
use std::io::{Error, ErrorKind};
use std::ops::{Deref, DerefMut};
use std::os::unix::io::AsRawFd;
use std::os::unix::prelude::CommandExt;
use std::path::PathBuf;
@@ -34,7 +34,6 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
use std::sync::Mutex;
use std::time::Duration;
use std::time::Instant;
use std::{fs, io};
use tracing::*;
use utils::crashsafe::path_with_suffix_extension;
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
@@ -45,7 +44,6 @@ use crate::metrics::{
};
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
use crate::repository::Key;
use crate::task_mgr::BACKGROUND_RUNTIME;
use crate::walrecord::NeonWalRecord;
use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
use pageserver_api::reltag::{RelTag, SlruKind};
@@ -101,7 +99,7 @@ pub struct PostgresRedoManager {
tenant_id: TenantId,
conf: &'static PageServerConf,
process: Mutex<Option<PostgresRedoProcess>>,
wal_redo_command: Mutex<libloading::Symbol<unsafe extern fn(cmd: u8, input: * const u8, size: u32, output: * u8)>>,
}
/// Can this request be served by neon redo functions
@@ -203,21 +201,17 @@ impl PostgresRedoManager {
///
pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager {
// The actual process is launched lazily, on first request.
PostgresRedoManager {
tenant_id,
conf,
process: Mutex::new(None),
}
}
/// Launch process pre-emptively. Should not be needed except for benchmarking.
pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> {
let inner = self.process.get_mut().unwrap();
if inner.is_none() {
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
*inner = Some(p);
}
Ok(())
unsafe {
let lib = libloading::Library::new("/home/knizhnik/zenith/pg_install/build/v14/src/backend/postgres.so").unwrap();
let main: libloading::Symbol<unsafe extern fn(arg0: * const u8, arg1: * const u8, arg2: * const u8, arg3: * const u8) -> u32> = lib.get(b"man").unwrap();
main(b"postgres".as_ptr(), b"--wal-redo".as_ptr(), std::ptr::null());
let wal_redo_command = lib.get(b"wal_redo_command").unwrap();
PostgresRedoManager {
tenant_id,
conf,
wal_redo_command: Mutex::new(wal_redo_command),
}
}
}
///
@@ -241,7 +235,7 @@ impl PostgresRedoManager {
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
*process_guard = Some(p);
}
let process = process_guard.as_mut().unwrap();
@@ -591,8 +585,7 @@ impl<C: CommandExt> CloseFileDescriptors for C {
/// Handle to the Postgres WAL redo process
///
struct PostgresRedoProcess {
tenant_id: TenantId,
child: NoLeakChild,
child: Child,
stdin: ChildStdin,
stdout: ChildStdout,
stderr: ChildStderr,
@@ -602,17 +595,16 @@ impl PostgresRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
fn launch(
conf: &PageServerConf,
tenant_id: TenantId,
tenant_id: &TenantId,
pg_version: u32,
) -> Result<PostgresRedoProcess, Error> {
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
let datadir = path_with_suffix_extension(
conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
conf.tenant_path(tenant_id).join("wal-redo-datadir"),
TEMP_FILE_SUFFIX,
);
@@ -667,7 +659,7 @@ impl PostgresRedoProcess {
}
// Start postgres itself
let child = Command::new(pg_bin_dir_path.join("postgres"))
let mut child = Command::new(pg_bin_dir_path.join("postgres"))
.arg("--wal-redo")
.stdin(Stdio::piped())
.stderr(Stdio::piped())
@@ -686,7 +678,7 @@ impl PostgresRedoProcess {
// as close-on-exec by default, but that's not enough, since we use
// libraries that directly call libc open without setting that flag.
.close_fds()
.spawn_no_leak_child()
.spawn()
.map_err(|e| {
Error::new(
e.kind(),
@@ -694,33 +686,20 @@ impl PostgresRedoProcess {
)
})?;
let mut child = scopeguard::guard(child, |child| {
error!("killing wal-redo-postgres process due to a problem during launch");
child.kill_and_wait();
});
info!(
"launched WAL redo postgres process on {}",
datadir.display()
);
let stdin = child.stdin.take().unwrap();
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
set_nonblock_or_log_err!(stderr)?;
// all fallible operations post-spawn are complete, so get rid of the guard
let child = scopeguard::ScopeGuard::into_inner(child);
set_nonblock(stdin.as_raw_fd())?;
set_nonblock(stdout.as_raw_fd())?;
set_nonblock(stderr.as_raw_fd())?;
Ok(PostgresRedoProcess {
tenant_id,
child,
stdin,
stdout,
@@ -728,16 +707,18 @@ impl PostgresRedoProcess {
})
}
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
fn kill(self) {
self.child.kill_and_wait();
fn kill(mut self) {
let _ = self.child.kill();
if let Ok(exit_status) = self.child.wait() {
error!("wal-redo-postgres exited with code {}", exit_status);
}
drop(self);
}
//
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
fn apply_wal_records(
&mut self,
tag: BufferTag,
@@ -750,14 +731,11 @@ impl PostgresRedoProcess {
// This could be problematic if there are millions of records to replay,
// but in practice the number of records is usually so small that it doesn't
// matter, and it's better to keep this code simple.
//
// Most requests start with a before-image with BLCKSZ bytes, followed by
// some other WAL records. Start with a buffer that can hold that
// comfortably.
let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
build_begin_redo_for_block_msg(tag, &mut writebuf);
let wal_redo_command = self.wal_redo_command.lock().unwrap();
let tag_data = tag.ser_into(buf).unwrap();
wal_redo_command(b'B', tag_data.as_ptr(), tag_data.len() as u32, std::ptr::null());
if let Some(img) = base_img {
build_push_page_msg(tag, &img, &mut writebuf);
wal_redo_command(b'P', img.as_ptr(), img.len() as u32, std::ptr::null());
}
for (lsn, rec) in records.iter() {
if let NeonWalRecord::Postgres {
@@ -765,7 +743,10 @@ impl PostgresRedoProcess {
rec: postgres_rec,
} = rec
{
build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
let mut buf = Vec::new();
buf.put_u64(*lsn.0);
buf.put(postgres_rec);
wal_redo_command(b'A', buf.get_ptr(), buf.len() as u32, std::ptr::null());
} else {
return Err(Error::new(
ErrorKind::Other,
@@ -773,190 +754,10 @@ impl PostgresRedoProcess {
));
}
}
build_get_page_msg(tag, &mut writebuf);
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
// The input is now in 'writebuf'. Do a blind write first, writing as much as
// we can, before calling poll(). That skips one call to poll() if the stdin is
// already available for writing, which it almost certainly is because the
// process is idle.
let mut nwrite = self.stdin.write(&writebuf)?;
// We expect the WAL redo process to respond with an 8k page image. We read it
// into this buffer.
let mut resultbuf = vec![0; BLCKSZ.into()];
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
// Prepare for calling poll()
let mut pollfds = [
PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
];
// We do three things simultaneously: send the old base image and WAL records to
// the child process's stdin, read the result from child's stdout, and forward any logging
// information that the child writes to its stderr to the page server's log.
while nresult < BLCKSZ.into() {
// If we have more data to write, wake up if 'stdin' becomes writeable or
// we have data to read. Otherwise only wake up if there's data to read.
let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
let n = loop {
match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) {
Err(e) if e == nix::errno::Errno::EINTR => continue,
res => break res,
}
}?;
if n == 0 {
return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
}
// If we have some messages in stderr, forward them to the log.
let err_revents = pollfds[1].revents().unwrap();
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
let mut errbuf: [u8; 16384] = [0; 16384];
let n = self.stderr.read(&mut errbuf)?;
// The message might not be split correctly into lines here. But this is
// good enough, the important thing is to get the message to the log.
if n > 0 {
error!(
"wal-redo-postgres: {}",
String::from_utf8_lossy(&errbuf[0..n])
);
// To make sure we capture all log from the process if it fails, keep
// reading from the stderr, before checking the stdout.
continue;
}
} else if err_revents.contains(PollFlags::POLLHUP) {
return Err(Error::new(
ErrorKind::BrokenPipe,
"WAL redo process closed its stderr unexpectedly",
));
}
// If we have more data to write and 'stdin' is writeable, do write.
if nwrite < writebuf.len() {
let in_revents = pollfds[2].revents().unwrap();
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
nwrite += self.stdin.write(&writebuf[nwrite..])?;
} else if in_revents.contains(PollFlags::POLLHUP) {
// We still have more data to write, but the process closed the pipe.
return Err(Error::new(
ErrorKind::BrokenPipe,
"WAL redo process closed its stdin unexpectedly",
));
}
}
// If we have some data in stdout, read it to the result buffer.
let out_revents = pollfds[0].revents().unwrap();
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
nresult += self.stdout.read(&mut resultbuf[nresult..])?;
} else if out_revents.contains(PollFlags::POLLHUP) {
return Err(Error::new(
ErrorKind::BrokenPipe,
"WAL redo process closed its stdout unexpectedly",
));
}
}
Ok(Bytes::from(resultbuf))
}
}
/// Wrapper type around `std::process::Child` which guarantees that the child
/// will be killed and waited-for by this process before being dropped.
struct NoLeakChild {
// `Some` while the wrapper owns the child; `kill_and_wait` and `Drop` take it out,
// leaving `None`, so the kill+wait sequence runs at most once.
child: Option<Child>,
}
impl Deref for NoLeakChild {
    type Target = Child;

    /// Gives shared access to the wrapped child process.
    ///
    /// Panics if the child has already been taken out of the wrapper.
    fn deref(&self) -> &Self::Target {
        match &self.child {
            Some(child) => child,
            None => panic!("must not use from drop"),
        }
    }
}
impl DerefMut for NoLeakChild {
    /// Gives exclusive access to the wrapped child process.
    ///
    /// Panics if the child has already been taken out of the wrapper.
    fn deref_mut(&mut self) -> &mut Self::Target {
        match &mut self.child {
            Some(child) => child,
            None => panic!("must not use from drop"),
        }
    }
}
impl NoLeakChild {
    /// Spawns `command` and wraps the resulting child process.
    fn spawn(command: &mut Command) -> io::Result<Self> {
        command
            .spawn()
            .map(|child| NoLeakChild { child: Some(child) })
    }

    /// Kills the child and waits for it synchronously, consuming the wrapper.
    fn kill_and_wait(mut self) {
        if let Some(child) = self.child.take() {
            Self::kill_and_wait_impl(child);
        }
    }

    /// Sends the kill, then reaps the child; both outcomes are only logged.
    #[instrument(skip_all, fields(pid=child.id()))]
    fn kill_and_wait_impl(mut child: Child) {
        if let Err(e) = child.kill() {
            // This branch is very unlikely because:
            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
            // - This is the only place that calls .kill()
            // - We consume `self`, so, .kill() can't be called twice.
            // - If the process exited by itself or was killed by someone else,
            //   .kill() will still succeed because we haven't wait()'ed yet.
            //
            // So, if we arrive here, we have really no idea what happened,
            // whether the PID stored in self.child is still valid, etc.
            // If this function were fallible, we'd return an error, but
            // since it isn't, all we can do is log an error and proceed
            // with the wait().
            error!(error = %e, "failed to SIGKILL; subsequent wait() might fail or wait for wrong process");
        }

        match child.wait() {
            Err(e) => {
                error!(error = %e, "wait error; might leak the child process; it will show as zombie (defunct)");
            }
            Ok(exit_status) => {
                // log at error level since .kill() is something we only do on errors ATM
                error!(exit_status = %exit_status, "wait successful");
            }
        }
    }
}
impl Drop for NoLeakChild {
    /// Kills and reaps the child in the background if the wrapper still owns it.
    fn drop(&mut self) {
        if let Some(child) = self.child.take() {
            // Offload the kill+wait of the child process into the background.
            // If someone stops the runtime, we'll leak the child process.
            // We can ignore that case because we only stop the runtime on pageserver exit.
            let reap = async move {
                tokio::task::spawn_blocking(move || {
                    Self::kill_and_wait_impl(child);
                })
                .await
            };
            BACKGROUND_RUNTIME.spawn(reap);
        }
    }
}
/// Extension trait for spawning a process whose handle is a `NoLeakChild` rather than a
/// plain `std::process::Child`.
trait NoLeakChildCommandExt {
/// Spawns the command and returns the child wrapped in a `NoLeakChild`.
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild>;
}
impl NoLeakChildCommandExt for Command {
fn spawn_no_leak_child(&mut self) -> io::Result<NoLeakChild> {
NoLeakChild::spawn(self)
let mut page = Vec::new();
page.resize(BLCKSZ as usize, 0);
wal_redo_command(b'G', tag_data.as_ptr(), tag_data.len() as u32, page.as_mut_ptr());
Ok(Bytes::from(page))
}
}

View File

@@ -32,25 +32,16 @@
#define PageStoreTrace DEBUG5
#define NEON_TAG "[NEON_SMGR] "
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true)))
bool connected = false;
PGconn *pageserver_conn = NULL;
/*
* WaitEventSet containing:
* - WL_SOCKET_READABLE on pageserver_conn,
* - WL_LATCH_SET on MyLatch, and
* - WL_EXIT_ON_PM_DEATH.
*/
WaitEventSet *pageserver_conn_wes = NULL;
char *page_server_connstring_raw;
int n_unflushed_requests = 0;
int flush_every_n_requests = 8;
int readahead_buffer_size = 128;
static void pageserver_flush(void);
static void
pageserver_connect()
{
@@ -67,7 +58,6 @@ pageserver_connect()
PQfinish(pageserver_conn);
pageserver_conn = NULL;
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "could not establish connection to pageserver"),
@@ -83,25 +73,22 @@ pageserver_connect()
neon_log(ERROR, "could not send pagestream command to pageserver");
}
pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
MyLatch, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
NULL, NULL);
AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
while (PQisBusy(pageserver_conn))
{
WaitEvent event;
int wc;
/* Sleep until there's something to do */
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(pageserver_conn),
-1L, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
if (wc & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(pageserver_conn))
{
@@ -109,7 +96,6 @@ pageserver_connect()
PQfinish(pageserver_conn);
pageserver_conn = NULL;
FreeWaitEventSet(pageserver_conn_wes);
neon_log(ERROR, "could not complete handshake with pageserver: %s",
msg);
@@ -126,29 +112,33 @@ pageserver_connect()
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(char **buffer)
call_PQgetCopyData(PGconn *conn, char **buffer)
{
int ret;
retry:
ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );
ret = PQgetCopyData(conn, buffer, 1 /* async */ );
if (ret == 0)
{
WaitEvent event;
int wc;
/* Sleep until there's something to do */
(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(conn),
-1L, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (event.events & WL_SOCKET_READABLE)
if (wc & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(pageserver_conn))
if (!PQconsumeInput(conn))
neon_log(ERROR, "could not get response from pageserver: %s",
PQerrorMessage(pageserver_conn));
PQerrorMessage(conn));
}
goto retry;
@@ -174,11 +164,7 @@ pageserver_disconnect(void)
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
prefetch_on_ps_disconnect();
}
if (pageserver_conn_wes != NULL)
FreeWaitEventSet(pageserver_conn_wes);
}
static void
@@ -188,7 +174,11 @@ pageserver_send(NeonRequest * request)
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
pageserver_disconnect();
{
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
}
if (!connected)
pageserver_connect();
@@ -212,11 +202,6 @@ pageserver_send(NeonRequest * request)
}
pfree(req_buff.data);
n_unflushed_requests++;
if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
pageserver_flush();
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = nm_to_string((NeonMessage *) request);
@@ -232,22 +217,16 @@ pageserver_receive(void)
StringInfoData resp_buff;
NeonResponse *resp;
if (!connected)
return NULL;
PG_TRY();
{
/* read response */
resp_buff.len = call_PQgetCopyData(&resp_buff.data);
resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
resp_buff.cursor = 0;
if (resp_buff.len < 0)
{
if (resp_buff.len == -1)
{
pageserver_disconnect();
return NULL;
}
neon_log(ERROR, "end of COPY");
else if (resp_buff.len == -2)
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
}
@@ -276,21 +255,25 @@ pageserver_receive(void)
static void
pageserver_flush(void)
{
if (!connected)
{
neon_log(WARNING, "Tried to flush while disconnected");
}
else if (PQflush(pageserver_conn))
if (PQflush(pageserver_conn))
{
char *msg = PQerrorMessage(pageserver_conn);
pageserver_disconnect();
neon_log(ERROR, "failed to flush page requests: %s", msg);
}
n_unflushed_requests = 0;
}
/*
 * pageserver_call() -- synchronous request/response round trip.
 *
 * Sends `request` with pageserver_send(), flushes with pageserver_flush(),
 * and returns whatever pageserver_receive() returns.
 */
static NeonResponse *
pageserver_call(NeonRequest * request)
{
pageserver_send(request);
/* make sure the request actually leaves the output buffer before waiting */
pageserver_flush();
return pageserver_receive();
}
page_server_api api = {
.request = pageserver_call,
.send = pageserver_send,
.flush = pageserver_flush,
.receive = pageserver_receive
@@ -444,27 +427,6 @@ pg_init_libpagestore(void)
PGC_SIGHUP,
GUC_UNIT_MB,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.flush_output_after",
"Flush the output buffer after every N unflushed requests",
NULL,
&flush_every_n_requests,
8, -1, INT_MAX,
PGC_USERSET,
0, /* no flags required */
NULL, NULL, NULL);
DefineCustomIntVariable("neon.readahead_buffer_size",
"number of prefetches to buffer",
"This buffer is used to store prefetched data; so "
"it is important that this buffer is at least as "
"large as the configured value of all tablespaces' "
"effective_io_concurrency and maintenance_io_concurrency, "
"your sessions' values of these, and the value for "
"seqscan_prefetch_buffers.",
&readahead_buffer_size,
128, 16, 1024,
PGC_USERSET,
0, /* no flags required */
NULL, (GucIntAssignHook) &readahead_buffer_resize, NULL);
relsize_hash_init();

View File

@@ -49,11 +49,6 @@ typedef struct
#define messageTag(m) (((const NeonMessage *)(m))->tag)
#define NEON_TAG "[NEON_SMGR] "
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
errhidestmt(true), errhidecontext(true)))
/*
* supertype of all the Neon*Request structs below
*
@@ -120,8 +115,6 @@ typedef struct
char page[FLEXIBLE_ARRAY_MEMBER];
} NeonGetPageResponse;
#define PS_GETPAGERESPONSE_SIZE (MAXALIGN(offsetof(NeonGetPageResponse, page) + BLCKSZ))
typedef struct
{
NeonMessageTag tag;
@@ -145,20 +138,15 @@ extern char *nm_to_string(NeonMessage * msg);
typedef struct
{
NeonResponse *(*request) (NeonRequest * request);
void (*send) (NeonRequest * request);
NeonResponse *(*receive) (void);
void (*flush) (void);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);
extern page_server_api * page_server;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern bool seqscan_prefetch_enabled;
extern int seqscan_prefetch_distance;
extern char *neon_timeline;
extern char *neon_tenant;
extern bool wal_redo;
@@ -166,7 +154,6 @@ extern int32 max_cluster_size;
extern const f_smgr *smgr_neon(BackendId backend, RelFileNode rnode);
extern void smgr_init_neon(void);
extern void readahead_buffer_resize(int newsize, void *extra);
/* Neon storage manager functionality */
@@ -180,6 +167,7 @@ extern void neon_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern bool neon_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void neon_reset_prefetch(SMgrRelation reln);
extern void neon_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);

View File

@@ -49,20 +49,22 @@
#include "access/xlog.h"
#include "access/xloginsert.h"
#include "access/xlog_internal.h"
#include "access/xlogdefs.h"
#include "catalog/pg_class.h"
#include "common/hashfn.h"
#include "pagestore_client.h"
#include "pagestore_client.h"
#include "storage/smgr.h"
#include "access/xlogdefs.h"
#include "postmaster/interrupt.h"
#include "postmaster/autovacuum.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/relfilenode.h"
#include "storage/buf_internals.h"
#include "storage/smgr.h"
#include "storage/md.h"
#include "fmgr.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "catalog/pg_tablespace_d.h"
#include "postmaster/autovacuum.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogutils.h"
@@ -111,660 +113,48 @@ typedef enum
static SMgrRelation unlogged_build_rel = NULL;
static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
/*
* Prefetch implementation:
*
* Prefetch is performed locally by each backend.
*
* There can be up to readahead_buffer_size active IO requests registered at
* any time. Requests using smgr_prefetch are sent to the pageserver, but we
* don't wait on the response. Requests using smgr_read are either read from
* the buffer, or (if that's not possible) we wait on the response to arrive -
* this also will allow us to receive other prefetched pages.
* Each request is immediately written to the output buffer of the pageserver
* connection, but may not be flushed if smgr_prefetch is used: pageserver
* flushes sent requests on manual flush, or every neon.flush_output_after
* unflushed requests; which is not necessarily always and all the time.
*
* Once we have received a response, this value will be stored in the response
* buffer, indexed in a hash table. This allows us to retain our buffered
* prefetch responses even when we have cache misses.
*
* Reading of prefetch responses is delayed until they are actually needed
* (smgr_read). In case of prefetch miss or any other SMGR request other than
* smgr_read, all prefetch responses in the pipeline will need to be read from
* the connection; the responses are stored for later use.
*
* NOTE: The current implementation of the prefetch system implements a ring
* buffer of up to readahead_buffer_size requests. If there are more _read and
* _prefetch requests between the initial _prefetch and the _read of a buffer,
* the prefetch request will have been dropped from this prefetch buffer, and
* your prefetch was wasted.
* There can be up to MAX_PREFETCH_REQUESTS requests registered using
* smgr_prefetch before smgr_read. All these requests are appended to the
* primary smgr_read request.
* It is assumed that pages will be requested in prefetch order.
* Reading of prefetch responses is delayed until they are actually needed
* (smgr_read). This makes it possible to parallelize processing and receiving
* of prefetched pages.
* In case of a prefetch miss or any SMGR request other than smgr_read,
* all prefetch responses have to be consumed.
*/
/*
* State machine:
*
* not in hash : in hash
* :
* UNUSED ------> REQUESTED --> RECEIVED
* ^ : | |
* | : v |
* | : TAG_REMAINS |
* | : | |
* +----------------+------------+
* :
*/
/*
 * Lifecycle state of one PrefetchRequest slot in the prefetch ring buffer.
 * Each member's comment lists which PrefetchRequest fields are meaningful
 * in that state.
 */
typedef enum PrefetchStatus {
PRFS_UNUSED = 0, /* unused slot; zero, so a zeroed slot is UNUSED */
PRFS_REQUESTED, /* request was written to the sendbuffer to PS, but not
* necessarily flushed.
* all fields except response valid */
PRFS_RECEIVED, /* response has been stored in the slot; all fields valid */
PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still valid */
} PrefetchStatus;
#define MAX_PREFETCH_REQUESTS 128
/*
 * One slot of the prefetch ring buffer: a getPage request identified by its
 * buffer tag, plus the response once it has been received.
 */
typedef struct PrefetchRequest {
BufferTag buftag; /* must be first entry in the struct: lookups cast a
* bare BufferTag pointer to PrefetchRequest * */
XLogRecPtr effective_request_lsn; /* LSN recorded by prefetch_do_request() */
NeonResponse *response; /* may be null; set when the response is received */
PrefetchStatus status; /* see PrefetchStatus */
uint64 my_ring_index; /* ring index this slot was registered under */
} PrefetchRequest;
BufferTag prefetch_requests[MAX_PREFETCH_REQUESTS];
BufferTag prefetch_responses[MAX_PREFETCH_REQUESTS];
int n_prefetch_requests;
int n_prefetch_responses;
int n_prefetched_buffers;
int n_prefetch_hits;
int n_prefetch_misses;
XLogRecPtr prefetch_lsn;
/* prefetch buffer lookup hash table */
/*
 * Element type of the prfh simplehash table (see the SH_* defines that
 * follow): maps a buffer tag to the ring slot holding that request.
 */
typedef struct PrfHashEntry {
PrefetchRequest *slot; /* hash key (SH_KEY); hashed/compared via slot->buftag */
uint32 status; /* presumably simplehash's per-entry status -- TODO confirm */
uint32 hash; /* stored hash value, returned by SH_GET_HASH */
} PrfHashEntry;
#define SH_PREFIX prfh
#define SH_ELEMENT_TYPE PrfHashEntry
#define SH_KEY_TYPE PrefetchRequest *
#define SH_KEY slot
#define SH_STORE_HASH
#define SH_GET_HASH(tb, a) ((a)->hash)
#define SH_HASH_KEY(tb, key) hash_bytes( \
((const unsigned char *) &(key)->buftag), \
sizeof(BufferTag) \
)
#define SH_EQUAL(tb, a, b) (BUFFERTAGS_EQUAL((a)->buftag, (b)->buftag))
#define SH_SCOPE static inline
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"
/*
* PrefetchState maintains the state of (prefetch) getPage@LSN requests.
* It maintains a (ring) buffer of in-flight requests and responses.
*
* We maintain several indexes into the ring buffer:
* ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
*
* ring_unused points to the first unused slot of the buffer
* ring_receive is the next request that is to be received
* ring_last is the oldest received entry in the buffer
*
* Apart from being an entry in the ring buffer of prefetch requests, each
* PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
*/
/*
 * Per-backend prefetch bookkeeping: memory contexts, the ring indexes,
 * counters, the buftag hash table and the ring buffer itself (a flexible
 * array member, so the struct is allocated with room for the slots).
 */
typedef struct PrefetchState {
MemoryContext bufctx; /* context for prf_buffer[].response allocations */
MemoryContext errctx; /* context made current while calling
* page_server->receive() in prefetch_read() */
MemoryContext hashctx; /* context passed to prfh_create() for prf_hash */
/* buffer indexes */
uint64 ring_unused; /* first unused slot */
uint64 ring_flush; /* next request to flush */
uint64 ring_receive; /* next slot that is to receive a response */
uint64 ring_last; /* min slot with a response value */
/* metrics / statistics */
int n_responses_buffered; /* incremented when a response is stored in a slot,
* decremented when a RECEIVED slot is cleared.
* NOTE(review): the previous wording was "not yet
* in buffers" -- confirm the intended meaning */
int n_requests_inflight; /* count of PS requests considered in flight */
int n_unused; /* count of buffers < unused, > last, that are also unused */
/* the buffers */
prfh_hash *prf_hash; /* buftag -> slot lookup table */
PrefetchRequest prf_buffer[]; /* prefetch buffers */
} PrefetchState;
PrefetchState *MyPState;
/*
 * GetPrfSlot(ring_index) -- address of the ring slot for ring_index.
 *
 * Asserts (in assert-enabled builds) that ring_index lies in
 * [ring_last, ring_unused), then maps it onto prf_buffer by taking it
 * modulo readahead_buffer_size.
 */
#define GetPrfSlot(ring_index) ( \
( \
AssertMacro((ring_index) < MyPState->ring_unused && \
(ring_index) >= MyPState->ring_last), \
&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
) \
)
int n_prefetch_hits = 0;
int n_prefetch_misses = 0;
int n_prefetch_missed_caches = 0;
int n_prefetch_dupes = 0;
XLogRecPtr prefetch_lsn = 0;
static void consume_prefetch_responses(void);
static uint64 prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn);
static bool prefetch_read(PrefetchRequest *slot);
static void prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn);
static bool prefetch_wait_for(uint64 ring_index);
static void prefetch_cleanup(void);
static inline void prefetch_set_unused(uint64 ring_index);
static XLogRecPtr neon_get_request_lsn(bool *latest, RelFileNode rnode,
ForkNumber forknum, BlockNumber blkno);
/*
 * readahead_buffer_resize() -- rebuild the prefetch ring for a new capacity.
 *
 * Installed as the assign hook of the neon.readahead_buffer_size GUC.
 * Allocates a PrefetchState with room for `newsize` slots, moves the most
 * recent live requests into it (newest last), releases the ones that do not
 * fit, and makes the new state current.  Does nothing when the prefetch
 * state has not been initialized yet.
 *
 * newsize: the new number of ring slots.
 * extra:   unused.
 */
void
readahead_buffer_resize(int newsize, void *extra)
{
	uint64		end,
				nfree = newsize;
	PrefetchState *newPState;

	/*
	 * The new state must be sized for the NEW capacity: slots are written at
	 * indexes up to newsize - 1 below.  Sizing it from readahead_buffer_size
	 * (still the old capacity here) would overflow the allocation whenever
	 * the buffer is enlarged.
	 */
	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) + (
		sizeof(PrefetchRequest) * newsize
	);

	/* don't try to re-initialize if we haven't initialized yet */
	if (MyPState == NULL)
		return;

	/*
	 * Make sure that we don't lose track of active prefetch requests by
	 * ensuring we have received all but the last n requests (n = newsize).
	 */
	if (MyPState->n_requests_inflight > newsize)
		prefetch_wait_for(MyPState->ring_unused - newsize);

	/* construct the new PrefetchState, and copy over the memory contexts */
	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);

	newPState->bufctx = MyPState->bufctx;
	newPState->errctx = MyPState->errctx;
	newPState->hashctx = MyPState->hashctx;
	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
	newPState->n_unused = newsize;
	newPState->n_requests_inflight = 0;
	newPState->n_responses_buffered = 0;
	newPState->ring_last = newsize;
	newPState->ring_unused = newsize;
	newPState->ring_receive = newsize;
	newPState->ring_flush = newsize;

	/*
	 * Copy over the prefetches.
	 *
	 * We populate the new array from the end, walking the old ring from its
	 * newest entry downwards: this retains the most recent prefetches, needs
	 * only one pass over the data, and compacts away unused slots.
	 *
	 * NOTE(review): GetPrfSlot() indexes the OLD ring with
	 * readahead_buffer_size, so this assumes the global still holds the old
	 * capacity while the hook runs -- confirm against GUC assign-hook
	 * ordering.
	 */
	for (end = MyPState->ring_unused - 1;
		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
		 end -= 1)
	{
		PrefetchRequest *slot = GetPrfSlot(end);
		PrefetchRequest *newslot;
		bool		found;

		if (slot->status == PRFS_UNUSED)
			continue;

		nfree -= 1;

		newslot = &newPState->prf_buffer[nfree];
		*newslot = *slot;
		newslot->my_ring_index = nfree;

		prfh_insert(newPState->prf_hash, newslot, &found);

		Assert(!found);

		/* rebuild the new ring's indexes and counters from the slot state */
		switch (newslot->status)
		{
			case PRFS_UNUSED:
				pg_unreachable();
			case PRFS_REQUESTED:
				newPState->n_requests_inflight += 1;
				newPState->ring_receive -= 1;
				newPState->ring_last -= 1;
				break;
			case PRFS_RECEIVED:
				newPState->n_responses_buffered += 1;
				newPState->ring_last -= 1;
				break;
			case PRFS_TAG_REMAINS:
				newPState->ring_last -= 1;
				break;
		}
		newPState->n_unused -= 1;
	}

	/* anything older than what fit into the new ring is released */
	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
	{
		prefetch_set_unused(end);
	}

	prfh_destroy(MyPState->prf_hash);
	pfree(MyPState);
	MyPState = newPState;
}
/*
 * Drain the connection: wait until every request that has been registered
 * in the ring has received its response.
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */
static void
consume_prefetch_responses(void)
{
	/* every registered request already has its response: nothing to do */
	if (MyPState->ring_receive >= MyPState->ring_unused)
		return;

	/* waiting for the newest request implies waiting for all older ones */
	prefetch_wait_for(MyPState->ring_unused - 1);
}
static void
prefetch_cleanup(void)
{
uint64 ring_index;
PrefetchRequest *slot;
while (MyPState->ring_last < MyPState->ring_receive) {
ring_index = MyPState->ring_last;
slot = GetPrfSlot(ring_index);
if (slot->status == PRFS_UNUSED)
MyPState->ring_last += 1;
else
break;
}
}
/*
* Wait for slot of ring_index to have received its response.
* The caller is responsible for making sure the request buffer is flushed.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static bool
prefetch_wait_for(uint64 ring_index)
{
PrefetchRequest *entry;
if (MyPState->ring_flush <= ring_index &&
MyPState->ring_unused > MyPState->ring_flush)
for (int i = n_prefetched_buffers; i < n_prefetch_responses; i++)
{
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
NeonResponse *resp = page_server->receive();
pfree(resp);
}
Assert(MyPState->ring_unused > ring_index);
while (MyPState->ring_receive <= ring_index)
{
entry = GetPrfSlot(MyPState->ring_receive);
Assert(entry->status == PRFS_REQUESTED);
if (!prefetch_read(entry))
return false;
}
return true;
}
/*
 * Receive the response for `slot` and store it in the slot.
 *
 * `slot` must be the next slot due to receive a response (ring_receive) and
 * must be in the REQUESTED state. The caller is responsible for making sure
 * that the request for this buffer was flushed to the PageServer.
 *
 * Returns true when a response was stored, false when
 * page_server->receive() returned no response (the slot is left untouched).
 *
 * NOTE: this function may indirectly update MyPState->pfs_hash; which
 * invalidates any active pointers into the hash table.
 */
static bool
prefetch_read(PrefetchRequest *slot)
{
	MemoryContext saved_ctx;
	NeonResponse *received;

	Assert(slot->status == PRFS_REQUESTED);
	Assert(slot->response == NULL);
	Assert(slot->my_ring_index == MyPState->ring_receive);

	/* receive with errctx as the current memory context */
	saved_ctx = MemoryContextSwitchTo(MyPState->errctx);
	received = (NeonResponse *) page_server->receive();
	MemoryContextSwitchTo(saved_ctx);

	if (!received)
		return false;

	/* the slot now holds a response */
	slot->response = received;
	slot->status = PRFS_RECEIVED;

	/* account for it in the ring state */
	MyPState->ring_receive += 1;
	MyPState->n_requests_inflight -= 1;
	MyPState->n_responses_buffered += 1;

	return true;
}
/*
 * Disconnect hook - discard every request still awaiting a response.
 *
 * If we don't remove the failed prefetches, we'd be serving incorrect
 * data to the smgr.
 */
void
prefetch_on_ps_disconnect(void)
{
	/* mark everything registered so far as flushed */
	MyPState->ring_flush = MyPState->ring_unused;

	for (;;)
	{
		uint64		pending_index = MyPState->ring_receive;
		PrefetchRequest *pending;

		if (pending_index >= MyPState->ring_unused)
			break;

		pending = GetPrfSlot(pending_index);

		Assert(pending->status == PRFS_REQUESTED);
		Assert(pending->my_ring_index == pending_index);

		/*
		 * Clean up the request: leave the REQUESTED state first, because
		 * prefetch_set_unused() does not accept REQUESTED slots.
		 */
		pending->status = PRFS_TAG_REMAINS;
		MyPState->n_requests_inflight -= 1;
		MyPState->ring_receive += 1;

		prefetch_set_unused(pending_index);
	}
}
/*
 * prefetch_set_unused() - clear a received prefetch slot
 *
 * The slot at ring_index must be a current member of the ring buffer,
 * and may not be in the PRFS_REQUESTED state.  An index below ring_last is
 * tolerated and ignored: such a slot has already been released.
 *
 * Frees the buffered response (if any), removes the slot from the hash
 * table, zeroes the slot, and advances ring_last when this slot was the one
 * holding it back.
 *
 * NOTE: this function will update MyPState->pfs_hash; which invalidates any
 * active pointers into the hash table.
 */
static inline void
prefetch_set_unused(uint64 ring_index)
{
	PrefetchRequest *slot;

	if (ring_index < MyPState->ring_last)
		return;					/* Should already be unused */

	/*
	 * Look the slot up only after the range guard above: GetPrfSlot()
	 * asserts ring_index >= ring_last, so resolving it earlier would trip
	 * that assertion on exactly the stale indexes the guard is meant to
	 * tolerate.
	 */
	slot = GetPrfSlot(ring_index);

	Assert(MyPState->ring_unused > ring_index);

	if (slot->status == PRFS_UNUSED)
		return;

	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);

	if (slot->status == PRFS_RECEIVED)
	{
		pfree(slot->response);
		slot->response = NULL;

		MyPState->n_responses_buffered -= 1;
		MyPState->n_unused += 1;
	}
	else
	{
		Assert(slot->response == NULL);
	}

	/* must happen before the slot is zeroed: the hash key is slot->buftag */
	prfh_delete(MyPState->prf_hash, slot);

	/* clear all fields */
	MemSet(slot, 0, sizeof(PrefetchRequest));
	slot->status = PRFS_UNUSED;

	/* run cleanup if we're holding back ring_last */
	if (MyPState->ring_last == ring_index)
		prefetch_cleanup();
}
/*
 * Build and send the getPage request for `slot`, then register the slot as
 * REQUESTED in the ring and in the hash table.
 *
 * When both force_latest and force_lsn are given, their values are sent
 * as-is. Otherwise the request LSN is obtained from neon_get_request_lsn().
 * The request is only written with page_server->send(); flushing is left to
 * the caller.
 */
static void
prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force_lsn)
{
	bool		already_present;
	NeonGetPageRequest request = {
		.req.tag = T_NeonGetPageRequest,
		.req.latest = false,
		.req.lsn = 0,
		.rnode = slot->buftag.rnode,
		.forknum = slot->buftag.forkNum,
		.blkno = slot->buftag.blockNum,
	};

	if (force_lsn == NULL || force_latest == NULL)
	{
		XLogRecPtr	computed_lsn;

		computed_lsn = neon_get_request_lsn(&request.req.latest,
											slot->buftag.rnode,
											slot->buftag.forkNum,
											slot->buftag.blockNum);

		/*
		 * Note: effective_request_lsn is potentially higher than the
		 * requested LSN, but still correct:
		 *
		 * There are no changes between the LSN actually requested and
		 * effective_request_lsn. If there were, the page would have been in
		 * cache and evicted between those two LSNs, and that eviction would
		 * have produced a larger request LSN for this page.
		 *
		 * A concurrent backend may load, modify and evict the page again,
		 * but the LSN of that eviction cannot be smaller than the current
		 * WAL insert/redo pointer, which is already larger than this
		 * prefetch_lsn. So in any case, that would invalidate this cache.
		 *
		 * The best LSN to use for effective_request_lsn would be
		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
		 */
		request.req.lsn = computed_lsn;
		prefetch_lsn = Max(prefetch_lsn, computed_lsn);
		slot->effective_request_lsn = prefetch_lsn;
	}
	else
	{
		/* the caller dictates exactly which version to fetch */
		request.req.latest = *force_latest;
		request.req.lsn = *force_lsn;
		slot->effective_request_lsn = *force_lsn;
	}

	Assert(slot->response == NULL);
	Assert(slot->my_ring_index == MyPState->ring_unused);

	page_server->send((NeonRequest *) &request);

	/* the request is out: account for it in the ring state */
	MyPState->ring_unused += 1;
	MyPState->n_unused -= 1;
	MyPState->n_requests_inflight += 1;

	/* publish the slot so later lookups by buffer tag find it */
	slot->status = PRFS_REQUESTED;
	prfh_insert(MyPState->prf_hash, slot, &already_present);
	Assert(!already_present);
}
/*
* prefetch_register_buffer() - register and prefetch buffer
*
* Register that we may want the contents of BufferTag in the near future.
*
* If force_latest and force_lsn are not NULL, those values are sent to the
* pageserver. If they are NULL, we utilize the lastWrittenLsn -infrastructure
* to fill in these values manually.
*
* NOTE: this function may indirectly update MyPState->pfs_hash; which
* invalidates any active pointers into the hash table.
*/
static uint64
prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_lsn)
{
uint64 ring_index;
PrefetchRequest req;
PrefetchRequest *slot;
PrfHashEntry *entry;
/* use an intermediate PrefetchRequest struct to ensure correct alignment */
req.buftag = tag;
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);
if (entry != NULL)
{
slot = entry->slot;
ring_index = slot->my_ring_index;
Assert(slot == GetPrfSlot(ring_index));
Assert(slot->status != PRFS_UNUSED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
Assert(BUFFERTAGS_EQUAL(slot->buftag, tag));
/*
* If we want a specific lsn, we do not accept requests that were made
* with a potentially different LSN.
*/
if (force_latest && force_lsn)
{
/* if we want the latest version, any effective_request_lsn < request lsn is OK */
if (*force_latest)
{
if (*force_lsn > slot->effective_request_lsn)
{
prefetch_wait_for(ring_index);
prefetch_set_unused(ring_index);
entry = NULL;
}
}
/* if we don't want the latest version, only accept requests with the exact same LSN */
else
{
if (*force_lsn != slot->effective_request_lsn)
{
prefetch_wait_for(ring_index);
prefetch_set_unused(ring_index);
entry = NULL;
}
}
}
/*
* We received a prefetch for a page that was recently read and
* removed from the buffers. Remove that request from the buffers.
*/
else if (slot->status == PRFS_TAG_REMAINS)
{
prefetch_set_unused(ring_index);
entry = NULL;
}
else
{
/* The buffered request is good enough, return that index */
n_prefetch_dupes++;
return ring_index;
}
}
/*
* If the prefetch queue is full, we need to make room by clearing the
* oldest slot. If the oldest slot holds a buffer that was already
* received, we can just throw it away; we fetched the page unnecessarily
* in that case. If the oldest slot holds a request that we haven't
* received a response for yet, we have to wait for the response to that
* before we can continue. We might not have even flushed the request to
* the pageserver yet, it might be just sitting in the output buffer. In
* that case, we flush it and wait for the response. (We could decide not
* to send it, but it's hard to abort when the request is already in the
* output buffer, and 'not sending' a prefetch request kind of goes
* against the principles of prefetching)
*/
if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
{
uint64 cleanup_index = MyPState->ring_last;
slot = GetPrfSlot(cleanup_index);
Assert(slot->status != PRFS_UNUSED);
/* We have the slot for ring_last, so that must still be in progress */
switch (slot->status)
{
case PRFS_REQUESTED:
Assert(MyPState->ring_receive == cleanup_index);
prefetch_wait_for(cleanup_index);
prefetch_set_unused(cleanup_index);
break;
case PRFS_RECEIVED:
case PRFS_TAG_REMAINS:
prefetch_set_unused(cleanup_index);
break;
default:
pg_unreachable();
}
}
/*
* The next buffer pointed to by `ring_unused` is now definitely empty,
* so we can insert the new request to it.
*/
ring_index = MyPState->ring_unused;
slot = &MyPState->prf_buffer[((ring_index) % readahead_buffer_size)];
Assert(MyPState->ring_last <= ring_index);
Assert(slot->status == PRFS_UNUSED);
/*
* We must update the slot data before insertion, because the hash
* function reads the buffer tag from the slot.
*/
slot->buftag = tag;
slot->my_ring_index = ring_index;
prefetch_do_request(slot, force_latest, force_lsn);
Assert(slot->status == PRFS_REQUESTED);
Assert(MyPState->ring_last <= ring_index &&
ring_index < MyPState->ring_unused);
if (flush_every_n_requests > 0 &&
MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
{
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
}
return ring_index;
n_prefetched_buffers = 0;
n_prefetch_responses = 0;
}
static NeonResponse *
page_server_request(void const *req)
{
NeonResponse* resp;
do {
page_server->send((NeonRequest *) req);
page_server->flush();
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
resp = page_server->receive();
} while (resp == NULL);
return resp;
consume_prefetch_responses();
return page_server->request((NeonRequest *) req);
}
@@ -878,15 +268,12 @@ nm_unpack_response(StringInfo s)
case T_NeonGetPageResponse:
{
NeonGetPageResponse *msg_resp;
NeonGetPageResponse *msg_resp = palloc0(offsetof(NeonGetPageResponse, page) + BLCKSZ);
msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
msg_resp->tag = tag;
/* XXX: should be varlena */
memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
pq_getmsgend(s);
Assert(msg_resp->tag == T_NeonGetPageResponse);
resp = (NeonResponse *) msg_resp;
break;
@@ -1230,33 +617,7 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, ch
void
neon_init(void)
{
Size prfs_size;
if (MyPState != NULL)
return;
prfs_size = offsetof(PrefetchState, prf_buffer) + (
sizeof(PrefetchRequest) * readahead_buffer_size
);
MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
MyPState->n_unused = readahead_buffer_size;
MyPState->bufctx = SlabContextCreate(TopMemoryContext,
"NeonSMGR/prefetch",
SLAB_DEFAULT_BLOCK_SIZE * 17,
PS_GETPAGERESPONSE_SIZE);
MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/errors",
ALLOCSET_DEFAULT_SIZES);
MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
"NeonSMGR/prefetch",
ALLOCSET_DEFAULT_SIZES);
MyPState->prf_hash = prfh_create(MyPState->hashctx,
readahead_buffer_size, NULL);
/* noop */
#ifdef DEBUG_COMPARE_LOCAL
mdinit();
#endif
@@ -1643,18 +1004,27 @@ neon_close(SMgrRelation reln, ForkNumber forknum)
}
/*
 * neon_reset_prefetch() -- drop all previously registered prefetch requests
 *
 * Resets the count of pending prefetch requests to zero; `reln` is not used.
 */
void
neon_reset_prefetch(SMgrRelation reln)
{
n_prefetch_requests = 0;
}
/*
* neon_prefetch() -- Initiate asynchronous read of the specified block of a relation
*/
bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
BufferTag tag;
uint64 ring_index PG_USED_FOR_ASSERTS_ONLY;
switch (reln->smgr_relpersistence)
{
case 0: /* probably shouldn't happen, but ignore it */
case 0:
/* probably shouldn't happen, but ignore it */
break;
case RELPERSISTENCE_PERMANENT:
break;
@@ -1666,17 +1036,14 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
elog(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
tag = (BufferTag) {
.rnode = reln->smgr_rnode.node,
.forkNum = forknum,
.blockNum = blocknum
};
ring_index = prefetch_register_buffer(tag, NULL, NULL);
Assert(ring_index < MyPState->ring_unused &&
MyPState->ring_last <= ring_index);
if (n_prefetch_requests < MAX_PREFETCH_REQUESTS)
{
prefetch_requests[n_prefetch_requests].rnode = reln->smgr_rnode.node;
prefetch_requests[n_prefetch_requests].forkNum = forknum;
prefetch_requests[n_prefetch_requests].blockNum = blocknum;
n_prefetch_requests += 1;
return true;
}
return false;
}
@@ -1727,72 +1094,81 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer)
{
NeonResponse *resp;
BufferTag buftag;
uint64 ring_index;
PrfHashEntry *entry;
PrefetchRequest *slot;
buftag = (BufferTag) {
.rnode = rnode,
.forkNum = forkNum,
.blockNum = blkno,
};
int i;
/*
* Try to find prefetched page in the list of received pages.
* Try to find the prefetched page. It is assumed that pages will be
* requested in the same order as they were prefetched, but some other
* backend may load a page into shared buffers, so some prefetch responses
* should be skipped.
*/
entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &buftag);
if (entry != NULL)
for (i = n_prefetched_buffers; i < n_prefetch_responses; i++)
{
slot = entry->slot;
if (slot->effective_request_lsn >= request_lsn)
{
ring_index = slot->my_ring_index;
n_prefetch_hits += 1;
}
else /* the current prefetch LSN is not large enough, so drop the prefetch */
resp = page_server->receive();
if (resp->tag == T_NeonGetPageResponse &&
RelFileNodeEquals(prefetch_responses[i].rnode, rnode) &&
prefetch_responses[i].forkNum == forkNum &&
prefetch_responses[i].blockNum == blkno)
{
char *page = ((NeonGetPageResponse *) resp)->page;
/*
* We can't drop cache for not-yet-received requested items. It is
* unlikely this happens, but it can happen if prefetch distance is
* large enough and a backend didn't consume all prefetch requests.
* Check if the prefetched page is still relevant. If it was updated by
* some other backend, then it should not be requested from smgr
* unless it was evicted from shared buffers. In the latter case
* last_evicted_lsn should be updated and request_lsn should be
* greater than prefetch_lsn. The maximum with the page LSN is used
* because the page returned by the page server may have an LSN that is
* either greater or smaller than the one requested.
*/
if (slot->status == PRFS_REQUESTED)
if (Max(prefetch_lsn, PageGetLSN(page)) >= request_lsn)
{
prefetch_wait_for(slot->my_ring_index);
n_prefetched_buffers = i + 1;
n_prefetch_hits += 1;
n_prefetch_requests = 0;
memcpy(buffer, page, BLCKSZ);
pfree(resp);
return;
}
/* drop caches */
prefetch_set_unused(slot->my_ring_index);
n_prefetch_missed_caches += 1;
/* make it look like a prefetch cache miss */
entry = NULL;
}
pfree(resp);
}
n_prefetched_buffers = 0;
n_prefetch_responses = 0;
n_prefetch_misses += 1;
{
NeonGetPageRequest request = {
.req.tag = T_NeonGetPageRequest,
.req.latest = request_latest,
.req.lsn = request_lsn,
.rnode = rnode,
.forknum = forkNum,
.blkno = blkno
};
if (n_prefetch_requests > 0)
{
/* Combine all prefetch requests with primary request */
page_server->send((NeonRequest *) & request);
for (i = 0; i < n_prefetch_requests; i++)
{
request.rnode = prefetch_requests[i].rnode;
request.forknum = prefetch_requests[i].forkNum;
request.blkno = prefetch_requests[i].blockNum;
prefetch_responses[i] = prefetch_requests[i];
page_server->send((NeonRequest *) & request);
}
page_server->flush();
n_prefetch_responses = n_prefetch_requests;
n_prefetch_requests = 0;
prefetch_lsn = request_lsn;
resp = page_server->receive();
}
else
{
resp = page_server->request((NeonRequest *) & request);
}
}
do
{
if (entry == NULL)
{
n_prefetch_misses += 1;
ring_index = prefetch_register_buffer(buftag, &request_latest,
&request_lsn);
slot = GetPrfSlot(ring_index);
}
Assert(slot->my_ring_index == ring_index);
Assert(MyPState->ring_last <= ring_index &&
MyPState->ring_unused > ring_index);
Assert(slot->status != PRFS_UNUSED);
Assert(GetPrfSlot(ring_index) == slot);
} while (!prefetch_wait_for(ring_index));
Assert(slot->status == PRFS_RECEIVED);
resp = slot->response;
switch (resp->tag)
{
case T_NeonGetPageResponse:
@@ -1812,13 +1188,12 @@ neon_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
errdetail("page server returned error: %s",
((NeonErrorResponse *) resp)->message)));
break;
default:
elog(ERROR, "unexpected response from page server with tag 0x%02x", resp->tag);
}
/* buffer was used, clean up for later reuse */
prefetch_set_unused(ring_index);
prefetch_cleanup();
pfree(resp);
}
/*
@@ -2440,6 +1815,7 @@ static const struct f_smgr neon_smgr =
.smgr_unlink = neon_unlink,
.smgr_extend = neon_extend,
.smgr_prefetch = neon_prefetch,
.smgr_reset_prefetch = neon_reset_prefetch,
.smgr_read = neon_read,
.smgr_write = neon_write,
.smgr_writeback = neon_writeback,

View File

@@ -75,7 +75,7 @@ static bool syncSafekeepers = false;
char *wal_acceptors_list;
int wal_acceptor_reconnect_timeout;
int wal_acceptor_connection_timeout;
int wal_acceptor_connect_timeout;
bool am_wal_proposer;
char *neon_timeline_walproposer = NULL;
@@ -119,7 +119,6 @@ static TimestampTz last_reconnect_attempt;
static WalproposerShmemState * walprop_shared;
/* Prototypes for private functions */
static void WalProposerRegister(void);
static void WalProposerInit(XLogRecPtr flushRecPtr, uint64 systemId);
static void WalProposerStart(void);
static void WalProposerLoop(void);
@@ -267,9 +266,9 @@ nwp_register_gucs(void)
DefineCustomIntVariable(
"neon.safekeeper_connect_timeout",
"Timeout for connection establishement and it's maintenance against safekeeper",
"Timeout after which give up connection attempt to safekeeper.",
NULL,
&wal_acceptor_connection_timeout,
&wal_acceptor_connect_timeout,
5000, 0, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MS,
@@ -418,9 +417,7 @@ WalProposerPoll(void)
ResetLatch(MyLatch);
break;
}
now = GetCurrentTimestamp();
if (rc == 0 || TimeToReconnect(now) <= 0) /* timeout expired: poll state */
if (rc == 0) /* timeout expired: poll state */
{
TimestampTz now;
@@ -441,11 +438,13 @@ WalProposerPoll(void)
{
Safekeeper *sk = &safekeeper[i];
if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
wal_acceptor_connection_timeout))
if ((sk->state == SS_CONNECTING_WRITE ||
sk->state == SS_CONNECTING_READ) &&
TimestampDifferenceExceeds(sk->startedConnAt, now,
wal_acceptor_connect_timeout))
{
elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
elog(WARNING, "failed to connect to node '%s:%s': exceeded connection timeout %dms",
sk->host, sk->port, wal_acceptor_connect_timeout);
ShutdownConnection(sk);
}
}
@@ -456,7 +455,7 @@ WalProposerPoll(void)
/*
* Register a background worker proposing WAL to wal acceptors.
*/
static void
void
WalProposerRegister(void)
{
BackgroundWorker bgw;
@@ -761,7 +760,7 @@ ResetConnection(Safekeeper *sk)
elog(LOG, "connecting with node %s:%s", sk->host, sk->port);
sk->state = SS_CONNECTING_WRITE;
sk->latestMsgReceivedAt = GetCurrentTimestamp();
sk->startedConnAt = GetCurrentTimestamp();
sock = walprop_socket(sk->conn);
sk->eventPos = AddWaitEventToSet(waitEvents, WL_SOCKET_WRITEABLE, sock, NULL, sk);
@@ -919,7 +918,7 @@ HandleConnectionEvent(Safekeeper *sk)
case WP_CONN_POLLING_OK:
elog(LOG, "connected with node %s:%s", sk->host,
sk->port);
sk->latestMsgReceivedAt = GetCurrentTimestamp();
/*
* We have to pick some event to update event set. We'll
* eventually need the socket to be readable, so we go with that.
@@ -2305,7 +2304,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage * anymsg)
ResetConnection(sk);
return false;
}
sk->latestMsgReceivedAt = GetCurrentTimestamp();
switch (tag)
{
case 'g':

View File

@@ -30,7 +30,7 @@
extern char *wal_acceptors_list;
extern int wal_acceptor_reconnect_timeout;
extern int wal_acceptor_connection_timeout;
extern int wal_acceptor_connect_timeout;
extern bool am_wal_proposer;
struct WalProposerConn; /* Defined in libpqwalproposer */
@@ -371,24 +371,24 @@ typedef struct Safekeeper
int eventPos; /* position in wait event set. Equal to -1 if*
* no event */
SafekeeperState state; /* safekeeper state machine state */
TimestampTz latestMsgReceivedAt; /* when latest msg is received */
TimestampTz startedConnAt; /* when connection attempt started */
AcceptorGreeting greetResponse; /* acceptor greeting */
VoteResponse voteResponse; /* the vote */
AppendResponse appendResponse; /* feedback for master */
} Safekeeper;
extern void WalProposerSync(int argc, char *argv[]);
extern void WalProposerMain(Datum main_arg);
extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
extern void WalProposerPoll(void);
extern void ParseReplicationFeedbackMessage(StringInfo reply_message,
ReplicationFeedback *rf);
extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
void WalProposerPoll(void);
void WalProposerRegister(void);
void ParseReplicationFeedbackMessage(StringInfo reply_message,
ReplicationFeedback * rf);
extern void StartProposerReplication(StartReplicationCmd *cmd);
extern Size WalproposerShmemSize(void);
extern bool WalproposerShmemInit(void);
extern void replication_feedback_set(ReplicationFeedback *rf);
extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
Size WalproposerShmemSize(void);
bool WalproposerShmemInit(void);
void replication_feedback_set(ReplicationFeedback * rf);
void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
/* libpqwalproposer hooks & helper type */

View File

@@ -93,6 +93,8 @@
PG_MODULE_MAGIC;
void wal_redo_command(char cmd, char const* input, int size, char* output);
static int ReadRedoCommand(StringInfo inBuf);
static void BeginRedoForBlock(StringInfo input_message);
static void PushPage(StringInfo input_message);
@@ -282,7 +284,7 @@ WalRedoMain(int argc, char *argv[])
if (enable_seccomp)
enter_seccomp_mode();
#endif /* HAVE_LIBSECCOMP */
#if 0
/*
* Main processing loop
*/
@@ -351,6 +353,7 @@ WalRedoMain(int argc, char *argv[])
firstchar)));
}
} /* end of input-reading loop */
#endif
}
@@ -733,8 +736,8 @@ redo_block_filter(XLogReaderState *record, uint8 block_id)
*
* After applying some records.
*/
static void
GetPage(StringInfo input_message)
static void*
GetPage(StringInfo input_message, char* dst)
{
RelFileNode rnode;
ForkNumber forknum;
@@ -763,8 +766,8 @@ GetPage(StringInfo input_message)
buf = NeonRedoReadBuffer(rnode, forknum, blknum, RBM_NORMAL);
Assert(buf == wal_redo_buffer);
page = BufferGetPage(buf);
/* single thread, so don't bother locking the page */
memcpy(dst, page, BLCKSZ);
#if 0
/* Response: Page content */
tot_written = 0;
do {
@@ -781,7 +784,7 @@ GetPage(StringInfo input_message)
}
tot_written += rc;
} while (tot_written < BLCKSZ);
#endif
ReleaseBuffer(buf);
DropRelFileNodeAllLocalBuffers(rnode);
wal_redo_buffer = InvalidBuffer;
@@ -845,3 +848,31 @@ buffered_read(void *buf, size_t count)
return (dst - (char *) buf);
}
/*
 * Apply a single WAL-redo command by direct function call.
 *
 * cmd    - one-letter command code: 'B' (BeginRedoForBlock),
 *          'P' (PushPage), 'A' (ApplyRecord) or 'G' (GetPage).
 * input  - command payload, `size` bytes long.
 * size   - length of the payload in bytes.
 * output - destination buffer passed to GetPage() for the 'G' command,
 *          which copies BLCKSZ bytes of page content into it; the other
 *          commands do not use it.
 *
 * Any other command code is silently ignored.
 *
 * NOTE(review): the forward declaration near the top of this file is
 * spelled wal_redo_command(), while this definition is wal_redo_input();
 * confirm which name the external caller links against.
 */
void
wal_redo_input(char cmd, char const *input, int size, char *output)
{
	StringInfoData input_message;

	/*
	 * Wrap the caller's buffer in a StringInfo without copying it.
	 * StringInfoData.data is a plain "char *", so the const qualifier must
	 * be cast away explicitly; a bare assignment discards the qualifier.
	 * NOTE(review): this assumes the handlers never write through the
	 * buffer - confirm.
	 */
	input_message.data = (char *) input;
	input_message.len = size;
	input_message.maxlen = size;
	input_message.cursor = 0;

	switch (cmd)
	{
		case 'B':				/* BeginRedoForBlock */
			BeginRedoForBlock(&input_message);
			break;

		case 'P':				/* PushPage */
			PushPage(&input_message);
			break;

		case 'A':				/* ApplyRecord */
			ApplyRecord(&input_message);
			break;

		case 'G':				/* GetPage */
			GetPage(&input_message, output);
			break;
	}
}

87
poetry.lock generated
View File

@@ -1077,17 +1077,6 @@ python-versions = ">=3.6"
[package.extras]
twisted = ["twisted"]
[[package]]
name = "psutil"
version = "5.9.4"
description = "Cross-platform lib for process and system monitoring in Python."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
[package.extras]
test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"]
[[package]]
name = "psycopg2-binary"
version = "2.9.3"
@@ -1218,6 +1207,18 @@ pytest = ">=6.1.0"
[package.extras]
testing = ["coverage (>=6.2)", "flaky (>=3.5.0)", "hypothesis (>=5.7.1)", "mypy (>=0.931)", "pytest-trio (>=0.7.0)"]
[[package]]
name = "pytest-forked"
version = "1.4.0"
description = "run tests in isolated forked subprocesses"
category = "main"
optional = false
python-versions = ">=3.6"
[package.dependencies]
py = "*"
pytest = ">=3.10"
[[package]]
name = "pytest-lazy-fixture"
version = "0.6.3"
@@ -1239,8 +1240,8 @@ python-versions = ">=3.6"
[package.dependencies]
pytest = [
{version = ">=5.0", markers = "python_version < \"3.10\""},
{version = ">=6.2.4", markers = "python_version >= \"3.10\""},
{version = ">=5.0", markers = "python_version < \"3.10\""},
]
[[package]]
@@ -1256,7 +1257,7 @@ pytest = ">=5.0.0"
[[package]]
name = "pytest-xdist"
version = "3.0.2"
version = "2.5.0"
description = "pytest xdist plugin for distributed testing and loop-on-failing modes"
category = "main"
optional = false
@@ -1265,6 +1266,7 @@ python-versions = ">=3.6"
[package.dependencies]
execnet = ">=1.1"
pytest = ">=6.2.0"
pytest-forked = "*"
[package.extras]
psutil = ["psutil (>=3.0)"]
@@ -1447,14 +1449,6 @@ category = "dev"
optional = false
python-versions = ">=3.7"
[[package]]
name = "types-psutil"
version = "5.9.5.4"
description = "Typing stubs for psutil"
category = "main"
optional = false
python-versions = "*"
[[package]]
name = "types-psycopg2"
version = "2.9.18"
@@ -1574,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "c95c184fccaf40815405ad616ec1c55869c7f87b72777cc3a9cbaff41de98977"
content-hash = "9352a89d49d34807f6a58f6c3f898acbd8cf3570e0f45ede973673644bde4d0e"
[metadata.files]
aiopg = [
@@ -1985,26 +1979,9 @@ prometheus-client = [
{file = "prometheus_client-0.14.1-py3-none-any.whl", hash = "sha256:522fded625282822a89e2773452f42df14b5a8e84a86433e3f8a189c1d54dc01"},
{file = "prometheus_client-0.14.1.tar.gz", hash = "sha256:5459c427624961076277fdc6dc50540e2bacb98eebde99886e59ec55ed92093a"},
]
psutil = [
{file = "psutil-5.9.4-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:c1ca331af862803a42677c120aff8a814a804e09832f166f226bfd22b56feee8"},
{file = "psutil-5.9.4-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:68908971daf802203f3d37e78d3f8831b6d1014864d7a85937941bb35f09aefe"},
{file = "psutil-5.9.4-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:3ff89f9b835100a825b14c2808a106b6fdcc4b15483141482a12c725e7f78549"},
{file = "psutil-5.9.4-cp27-cp27m-win32.whl", hash = "sha256:852dd5d9f8a47169fe62fd4a971aa07859476c2ba22c2254d4a1baa4e10b95ad"},
{file = "psutil-5.9.4-cp27-cp27m-win_amd64.whl", hash = "sha256:9120cd39dca5c5e1c54b59a41d205023d436799b1c8c4d3ff71af18535728e94"},
{file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6b92c532979bafc2df23ddc785ed116fced1f492ad90a6830cf24f4d1ea27d24"},
{file = "psutil-5.9.4-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:efeae04f9516907be44904cc7ce08defb6b665128992a56957abc9b61dca94b7"},
{file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"},
{file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"},
{file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"},
{file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"},
{file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"},
{file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"},
{file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"},
]
psycopg2-binary = [
{file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
{file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -2038,7 +2015,6 @@ psycopg2-binary = [
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
{file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -2050,7 +2026,6 @@ psycopg2-binary = [
{file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
{file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
{file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -2067,7 +2042,18 @@ py = [
{file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"},
]
pyasn1 = [
{file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
{file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
{file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
{file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
{file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
{file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
{file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
{file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
{file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
{file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
{file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
{file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
{file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
]
pycodestyle = [
@@ -2125,6 +2111,10 @@ pytest-asyncio = [
{file = "pytest-asyncio-0.19.0.tar.gz", hash = "sha256:ac4ebf3b6207259750bc32f4c1d8fcd7e79739edbc67ad0c58dd150b1d072fed"},
{file = "pytest_asyncio-0.19.0-py3-none-any.whl", hash = "sha256:7a97e37cfe1ed296e2e84941384bdd37c376453912d397ed39293e0916f521fa"},
]
pytest-forked = [
{file = "pytest-forked-1.4.0.tar.gz", hash = "sha256:8b67587c8f98cbbadfdd804539ed5455b6ed03802203485dd2f53c1422d7440e"},
{file = "pytest_forked-1.4.0-py3-none-any.whl", hash = "sha256:bbbb6717efc886b9d64537b41fb1497cfaf3c9601276be8da2cccfea5a3c8ad8"},
]
pytest-lazy-fixture = [
{file = "pytest-lazy-fixture-0.6.3.tar.gz", hash = "sha256:0e7d0c7f74ba33e6e80905e9bfd81f9d15ef9a790de97993e34213deb5ad10ac"},
{file = "pytest_lazy_fixture-0.6.3-py3-none-any.whl", hash = "sha256:e0b379f38299ff27a653f03eaa69b08a6fd4484e46fd1c9907d984b9f9daeda6"},
@@ -2138,8 +2128,8 @@ pytest-timeout = [
{file = "pytest_timeout-2.1.0-py3-none-any.whl", hash = "sha256:f6f50101443ce70ad325ceb4473c4255e9d74e3c7cd0ef827309dfa4c0d975c6"},
]
pytest-xdist = [
{file = "pytest-xdist-3.0.2.tar.gz", hash = "sha256:688da9b814370e891ba5de650c9327d1a9d861721a524eb917e620eec3e90291"},
{file = "pytest_xdist-3.0.2-py3-none-any.whl", hash = "sha256:9feb9a18e1790696ea23e1434fa73b325ed4998b0e9fcb221f16fd1945e6df1b"},
{file = "pytest-xdist-2.5.0.tar.gz", hash = "sha256:4580deca3ff04ddb2ac53eba39d76cb5dd5edeac050cb6fbc768b0dd712b4edf"},
{file = "pytest_xdist-2.5.0-py3-none-any.whl", hash = "sha256:6fe5c74fec98906deb8f2d2b616b5c782022744978e7bd4695d39c8f42d0ce65"},
]
python-dateutil = [
{file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"},
@@ -2173,13 +2163,6 @@ pyyaml = [
{file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
{file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
{file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
{file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
{file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
{file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
{file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
{file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
{file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
{file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2247,10 +2230,6 @@ tomli = [
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
types-psutil = [
{file = "types-psutil-5.9.5.4.tar.gz", hash = "sha256:aa09102b80c65a3b4573216614372398dab78972d650488eaff1ff05482cc18f"},
{file = "types_psutil-5.9.5.4-py3-none-any.whl", hash = "sha256:28e59764630187e462d43788efa16d59d5e77b510115f9e25901b2d4007fca62"},
]
types-psycopg2 = [
{file = "types-psycopg2-2.9.18.tar.gz", hash = "sha256:9b0e9e1f097b15cd9fa8aad2596a9e3082fd72f8d9cfe52b190cfa709105b6c0"},
{file = "types_psycopg2-2.9.18-py3-none-any.whl", hash = "sha256:14c779dcab18c31453fa1cad3cf4b1601d33540a344adead3c47a6b8091cd2fa"},

View File

@@ -1,7 +1,7 @@
//! Client authentication mechanisms.
pub mod backend;
pub use backend::{BackendType, ConsoleReqExtra};
pub use backend::{BackendType, ConsoleReqExtra, DatabaseInfo};
mod credentials;
pub use credentials::ClientCredentials;

View File

@@ -12,6 +12,7 @@ use crate::{
waiters::{self, Waiter, Waiters},
};
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
use tokio::io::{AsyncRead, AsyncWrite};
use tracing::{info, warn};
@@ -35,6 +36,45 @@ pub fn notify(psql_session_id: &str, msg: mgmt::ComputeReady) -> Result<(), wait
CPLANE_WAITERS.notify(psql_session_id, msg)
}
/// Compute node connection params provided by the cloud.
/// Note how it implements serde traits, since we receive it over the wire.
///
/// Only `host` and `port` appear in the `Debug` output of this type; the
/// remaining fields are deliberately left out of it.
#[derive(Serialize, Deserialize, Default)]
pub struct DatabaseInfo {
/// Host of the compute node.
pub host: String,
/// Port of the compute node.
pub port: u16,
/// Name of the database to connect to.
pub dbname: String,
/// User name to connect as.
pub user: String,
/// Optional password; when `None`, no password is set on the
/// `tokio_postgres::Config` built from this struct.
pub password: Option<String>,
}
// `Debug` is written by hand so that only the non-sensitive fields
// (`host` and `port`) are printed; everything else is elided.
impl std::fmt::Debug for DatabaseInfo {
    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Pick out exactly the fields we are willing to show.
        let Self { host, port, .. } = self;

        let mut builder = fmt.debug_struct("DatabaseInfo");
        builder.field("host", host);
        builder.field("port", port);
        // Marks the output as incomplete (`..`) instead of listing the rest.
        builder.finish_non_exhaustive()
    }
}
impl From<DatabaseInfo> for tokio_postgres::Config {
fn from(db_info: DatabaseInfo) -> Self {
let mut config = tokio_postgres::Config::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
config
}
}
/// Extra query params we'd like to pass to the console.
pub struct ConsoleReqExtra<'a> {
/// A unique identifier for a connection.
@@ -118,107 +158,54 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
}
}
/// The outcome of an authentication attempt that succeeded.
pub struct AuthSuccess<T> {
    /// Whether [`pq_proto::BeMessage::AuthenticationOk`] has already been
    /// sent to the client.
    pub reported_auth_ok: bool,
    /// The payload carried by the successful result.
    pub value: T,
}

impl<T> AuthSuccess<T> {
    /// Transform the carried value while keeping `reported_auth_ok` as is,
    /// in the spirit of [`std::option::Option::map`]: turns an
    /// [`AuthSuccess<T>`] into an [`AuthSuccess<R>`] by running `f` on the
    /// contained value.
    pub fn map<R>(self, f: impl FnOnce(T) -> R) -> AuthSuccess<R> {
        let Self {
            reported_auth_ok,
            value,
        } = self;

        AuthSuccess {
            reported_auth_ok,
            value: f(value),
        }
    }
}
/// Info for establishing a connection to a compute node.
/// This is what we get after auth succeeded, but not before!
///
/// `try_password_hack` returns it wrapped in an [`AuthSuccess`].
pub struct NodeInfo {
/// Project from [`auth::ClientCredentials`].
/// In the password hack flow it is taken from the received payload instead.
pub project: String,
/// Compute node connection params.
pub config: compute::ConnCfg,
}
impl BackendType<'_, ClientCredentials<'_>> {
/// Do something special if user didn't provide the `project` parameter.
///
/// Returns `Ok(None)` unless the backend is `Console` or `Postgres` and its
/// credentials have no project. Otherwise it runs the `PasswordHack` auth
/// flow to obtain a payload carrying the project and password, wakes the
/// compute node using that project, sets the payload's password on the
/// resulting config, and returns it as an [`AuthSuccess`] with
/// `reported_auth_ok: false`.
///
/// Errors from the auth flow and from `wake_compute` are propagated via `?`.
async fn try_password_hack(
&mut self,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<Option<AuthSuccess<NodeInfo>>> {
use BackendType::*;
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in the place of password.
//
// This async block is only defined here; it does not run until one of
// the match arms below awaits it.
let fetch_magic_payload = async {
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
.authenticate()
.await?;
info!(project = &payload.project, "received missing parameter");
auth::Result::Ok(payload)
};
// TODO: find a proper way to merge those very similar blocks.
let (mut config, payload) = match self {
Console(endpoint, creds) if creds.project.is_none() => {
let payload = fetch_magic_payload.await?;
// Work on a separate `ClientCredentials` value obtained via `as_ref()`
// and fill in the project received in the payload.
let mut creds = creds.as_ref();
creds.project = Some(payload.project.as_str().into());
let config = console::Api::new(endpoint, extra, &creds)
.wake_compute()
.await?;
(config, payload)
}
Postgres(endpoint, creds) if creds.project.is_none() => {
let payload = fetch_magic_payload.await?;
// Same as the `Console` arm, but using the `postgres` API.
let mut creds = creds.as_ref();
creds.project = Some(payload.project.as_str().into());
let config = postgres::Api::new(endpoint, &creds).wake_compute().await?;
(config, payload)
}
// Any other case: nothing for this method to do.
_ => return Ok(None),
};
// The password comes from the payload as well.
config.password(payload.password);
Ok(Some(AuthSuccess {
reported_auth_ok: false,
value: NodeInfo {
project: payload.project,
config,
},
}))
}
/// Authenticate the client via the requested backend, possibly using credentials.
pub async fn authenticate(
mut self,
extra: &ConsoleReqExtra<'_>,
client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<AuthSuccess<NodeInfo>> {
) -> super::Result<compute::NodeInfo> {
use BackendType::*;
// Handle cases when `project` is missing in `creds`.
// TODO: type safety: return `creds` with irrefutable `project`.
if let Some(res) = self.try_password_hack(extra, client).await? {
info!("user successfully authenticated (using the password hack)");
return Ok(res);
if let Console(_, creds) | Postgres(_, creds) = &mut self {
// If there's no project so far, that entails that client doesn't
// support SNI or other means of passing the project name.
// We now expect to see a very specific payload in the place of password.
if creds.project().is_none() {
warn!("project name not specified, resorting to the password hack auth flow");
let payload = AuthFlow::new(client)
.begin(auth::PasswordHack)
.await?
.authenticate()
.await?;
// Finally we may finish the initialization of `creds`.
// TODO: add missing type safety to ClientCredentials.
info!(project = &payload.project, "received missing parameter");
creds.project = Some(payload.project.into());
let mut config = match &self {
Console(endpoint, creds) => {
console::Api::new(endpoint, extra, creds)
.wake_compute()
.await?
}
Postgres(endpoint, creds) => {
postgres::Api::new(endpoint, creds).wake_compute().await?
}
_ => unreachable!("see the patterns above"),
};
// We should use a password from payload as well.
config.password(payload.password);
info!("user successfully authenticated (using the password hack)");
return Ok(compute::NodeInfo {
reported_auth_ok: false,
config,
});
}
}
let res = match self {
@@ -228,34 +215,22 @@ impl BackendType<'_, ClientCredentials<'_>> {
project = creds.project(),
"performing authentication using the console"
);
assert!(creds.project.is_some());
console::Api::new(&endpoint, extra, &creds)
.handle_user(client)
.await?
.map(|config| NodeInfo {
project: creds.project.unwrap().into_owned(),
config,
})
.await
}
Postgres(endpoint, creds) => {
info!("performing mock authentication using a local postgres instance");
assert!(creds.project.is_some());
postgres::Api::new(&endpoint, &creds)
.handle_user(client)
.await?
.map(|config| NodeInfo {
project: creds.project.unwrap().into_owned(),
config,
})
.await
}
// NOTE: this auth backend doesn't use client credentials.
Link(url) => {
info!("performing link authentication");
link::handle_user(&url, client).await?
link::handle_user(&url, client).await
}
};
}?;
info!("user successfully authenticated");
Ok(res)

View File

@@ -1,9 +1,9 @@
//! Cloud API V2.
use super::{AuthSuccess, ConsoleReqExtra};
use super::ConsoleReqExtra;
use crate::{
auth::{self, AuthFlow, ClientCredentials},
compute,
compute::{self, ComputeConnCfg},
error::{io_error, UserFacingError},
http, scram,
stream::PqStream,
@@ -128,7 +128,7 @@ impl<'a> Api<'a> {
pub(super) async fn handle_user(
self,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
) -> auth::Result<compute::NodeInfo> {
handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
}
@@ -164,7 +164,7 @@ impl<'a> Api<'a> {
}
/// Wake up the compute node and return the corresponding connection info.
pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
let request_id = uuid::Uuid::new_v4().to_string();
let req = self
.endpoint
@@ -195,7 +195,7 @@ impl<'a> Api<'a> {
Some(x) => x,
};
let mut config = compute::ConnCfg::new();
let mut config = ComputeConnCfg::new();
config
.host(host)
.port(port)
@@ -213,10 +213,10 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
endpoint: &'a Endpoint,
get_auth_info: impl FnOnce(&'a Endpoint) -> GetAuthInfo,
wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
) -> auth::Result<AuthSuccess<compute::ConnCfg>>
) -> auth::Result<compute::NodeInfo>
where
GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
WakeCompute: Future<Output = Result<compute::ConnCfg, WakeComputeError>>,
WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
{
info!("fetching user's authentication info");
let auth_info = get_auth_info(endpoint).await?;
@@ -243,9 +243,9 @@ where
config.auth_keys(tokio_postgres::config::AuthKeys::ScramSha256(keys));
}
Ok(AuthSuccess {
Ok(compute::NodeInfo {
reported_auth_ok: false,
value: config,
config,
})
}

View File

@@ -1,4 +1,3 @@
use super::{AuthSuccess, NodeInfo};
use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
use pq_proto::{BeMessage as Be, BeParameterStatusMessage};
use thiserror::Error;
@@ -50,7 +49,7 @@ pub fn new_psql_session_id() -> String {
pub async fn handle_user(
link_uri: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
) -> auth::Result<AuthSuccess<NodeInfo>> {
) -> auth::Result<compute::NodeInfo> {
let psql_session_id = new_psql_session_id();
let span = info_span!("link", psql_session_id = &psql_session_id);
let greeting = hello_message(link_uri, &psql_session_id);
@@ -72,22 +71,8 @@ pub async fn handle_user(
client.write_message_noflush(&Be::NoticeResponse("Connecting to database."))?;
let mut config = compute::ConnCfg::new();
config
.host(&db_info.host)
.port(db_info.port)
.dbname(&db_info.dbname)
.user(&db_info.user);
if let Some(password) = db_info.password {
config.password(password);
}
Ok(AuthSuccess {
Ok(compute::NodeInfo {
reported_auth_ok: true,
value: NodeInfo {
project: db_info.project,
config,
},
config: db_info.into(),
})
}

View File

@@ -1,12 +1,12 @@
//! Local mock of Cloud API V2.
use super::{
console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
AuthSuccess,
};
use crate::{
auth::{self, ClientCredentials},
compute,
auth::{
self,
backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
ClientCredentials,
},
compute::{self, ComputeConnCfg},
error::io_error,
scram,
stream::PqStream,
@@ -37,7 +37,7 @@ impl<'a> Api<'a> {
pub(super) async fn handle_user(
self,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<AuthSuccess<compute::ConnCfg>> {
) -> auth::Result<compute::NodeInfo> {
// We reuse user handling logic from a production module.
console::handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
}
@@ -82,8 +82,8 @@ impl<'a> Api<'a> {
}
/// We don't need to wake anything locally, so we just return the connection info.
pub(super) async fn wake_compute(&self) -> Result<compute::ConnCfg, WakeComputeError> {
let mut config = compute::ConnCfg::new();
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
let mut config = ComputeConnCfg::new();
config
.host(self.endpoint.host_str().unwrap_or("localhost"))
.port(self.endpoint.port().unwrap_or(5432))

View File

@@ -36,23 +36,11 @@ pub struct ClientCredentials<'a> {
}
impl ClientCredentials<'_> {
#[inline]
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
impl<'a> ClientCredentials<'a> {
#[inline]
pub fn as_ref(&'a self) -> ClientCredentials<'a> {
Self {
user: self.user,
dbname: self.dbname,
project: self.project().map(Cow::Borrowed),
}
}
}
impl<'a> ClientCredentials<'a> {
pub fn parse(
params: &'a StartupMessageParams,

View File

@@ -40,36 +40,17 @@ impl UserFacingError for ConnectionError {
/// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`.
pub type ScramKeys = tokio_postgres::config::ScramKeys<32>;
/// A config for establishing a connection to compute node.
/// Eventually, `tokio_postgres` will be replaced with something better.
/// Newtype allows us to implement methods on top of it.
#[repr(transparent)]
pub struct ConnCfg(pub tokio_postgres::Config);
pub type ComputeConnCfg = tokio_postgres::Config;
impl ConnCfg {
/// Construct a new connection config.
pub fn new() -> Self {
Self(tokio_postgres::Config::new())
}
/// Various compute node info for establishing connection etc.
pub struct NodeInfo {
/// Did we send [`pq_proto::BeMessage::AuthenticationOk`]?
pub reported_auth_ok: bool,
/// Compute node connection params.
pub config: tokio_postgres::Config,
}
impl std::ops::Deref for ConnCfg {
type Target = tokio_postgres::Config;
fn deref(&self) -> &Self::Target {
&self.0
}
}
/// For now, let's make it easier to setup the config.
impl std::ops::DerefMut for ConnCfg {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl ConnCfg {
/// Establish a raw TCP connection to the compute node.
impl NodeInfo {
async fn connect_raw(&self) -> io::Result<(SocketAddr, TcpStream)> {
use tokio_postgres::config::Host;
@@ -87,8 +68,8 @@ impl ConnCfg {
// because it has no means for extracting the underlying socket which we
// require for our business.
let mut connection_error = None;
let ports = self.0.get_ports();
let hosts = self.0.get_hosts();
let ports = self.config.get_ports();
let hosts = self.config.get_hosts();
// the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array
if ports.len() > 1 && ports.len() != hosts.len() {
return Err(io::Error::new(
@@ -96,7 +77,7 @@ impl ConnCfg {
format!(
"couldn't connect: bad compute config, \
ports and hosts entries' count does not match: {:?}",
self.0
self.config
),
));
}
@@ -122,7 +103,7 @@ impl ConnCfg {
Err(connection_error.unwrap_or_else(|| {
io::Error::new(
io::ErrorKind::Other,
format!("couldn't connect: bad compute config: {:?}", self.0),
format!("couldn't connect: bad compute config: {:?}", self.config),
)
}))
}
@@ -135,7 +116,7 @@ pub struct PostgresConnection {
pub version: String,
}
impl ConnCfg {
impl NodeInfo {
/// Connect to a corresponding compute node.
pub async fn connect(
mut self,
@@ -149,21 +130,21 @@ impl ConnCfg {
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
self.0.options(&options);
self.config.options(&options);
}
if let Some(app_name) = params.get("application_name") {
self.0.application_name(app_name);
self.config.application_name(app_name);
}
if let Some(replication) = params.get("replication") {
use tokio_postgres::config::ReplicationMode;
match replication {
"true" | "on" | "yes" | "1" => {
self.0.replication_mode(ReplicationMode::Physical);
self.config.replication_mode(ReplicationMode::Physical);
}
"database" => {
self.0.replication_mode(ReplicationMode::Logical);
self.config.replication_mode(ReplicationMode::Logical);
}
_other => {}
}
@@ -179,7 +160,7 @@ impl ConnCfg {
.map_err(|_| ConnectionError::FailedToConnectToCompute)?;
// TODO: establish a secure connection to the DB
let (client, conn) = self.0.connect_raw(&mut stream, NoTls).await?;
let (client, conn) = self.config.connect_raw(&mut stream, NoTls).await?;
let version = conn
.parameter("server_version")
.ok_or(ConnectionError::FailedToFetchPgVersion)?

View File

@@ -6,11 +6,16 @@ use std::{
net::{TcpListener, TcpStream},
thread,
};
use tracing::{error, info, info_span};
use tracing::{error, info};
use utils::postgres_backend::{self, AuthType, PostgresBackend};
/// Console management API listener thread.
/// It spawns console response handlers needed for the link auth.
/// TODO: move all of that to auth-backend/link.rs when we ditch legacy-console backend
///
/// Main proxy listener loop.
///
/// Listens for connections, and launches a new handler thread for each.
///
pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
scopeguard::defer! {
info!("mgmt has shut down");
@@ -19,7 +24,6 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
listener
.set_nonblocking(false)
.context("failed to set listener to blocking")?;
loop {
let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
info!("accepted connection from {peer_addr}");
@@ -27,19 +31,9 @@ pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
.set_nodelay(true)
.context("failed to set client socket option")?;
// TODO: replace with async tasks.
thread::spawn(move || {
let tid = std::thread::current().id();
let span = info_span!("mgmt", thread = format_args!("{tid:?}"));
let _enter = span.enter();
info!("started a new console management API thread");
scopeguard::defer! {
info!("console management API thread is about to finish");
}
if let Err(e) = handle_connection(socket) {
error!("thread failed with an error: {e}");
if let Err(err) = handle_connection(socket) {
error!("{err}");
}
});
}
@@ -50,21 +44,44 @@ fn handle_connection(socket: TcpStream) -> anyhow::Result<()> {
pgbackend.run(&mut MgmtHandler)
}
/// Known as `kickResponse` in the console.
#[derive(Debug, Deserialize)]
struct MgmtHandler;
/// Serialized examples:
// {
// "session_id": "71d6d03e6d93d99a",
// "result": {
// "Success": {
// "host": "127.0.0.1",
// "port": 5432,
// "dbname": "stas",
// "user": "stas",
// "password": "mypass"
// }
// }
// }
// {
// "session_id": "71d6d03e6d93d99a",
// "result": {
// "Failure": "oops"
// }
// }
//
// // to test manually by sending a query to mgmt interface:
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
#[derive(Deserialize)]
struct PsqlSessionResponse {
session_id: String,
result: PsqlSessionResult,
}
#[derive(Debug, Deserialize)]
#[derive(Deserialize)]
enum PsqlSessionResult {
Success(DatabaseInfo),
Success(auth::DatabaseInfo),
Failure(String),
}
/// A message received by `mgmt` when a compute node is ready.
pub type ComputeReady = Result<DatabaseInfo, String>;
pub type ComputeReady = Result<auth::DatabaseInfo, String>;
impl PsqlSessionResult {
fn into_compute_ready(self) -> ComputeReady {
@@ -75,51 +92,25 @@ impl PsqlSessionResult {
}
}
/// Compute node connection params provided by the console.
/// This struct and its parents are mgmt API implementation
/// detail and thus should remain in this module.
// TODO: restore deserialization tests from git history.
#[derive(Deserialize)]
pub struct DatabaseInfo {
pub host: String,
pub port: u16,
pub dbname: String,
pub user: String,
/// Console always provides a password, but it might
/// be inconvenient for debug with local PG instance.
pub password: Option<String>,
pub project: String,
}
// Manually implement debug to omit sensitive info.
impl std::fmt::Debug for DatabaseInfo {
fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
fmt.debug_struct("DatabaseInfo")
.field("host", &self.host)
.field("port", &self.port)
.field("dbname", &self.dbname)
.field("user", &self.user)
.finish_non_exhaustive()
}
}
// TODO: replace with an http-based protocol.
struct MgmtHandler;
impl postgres_backend::Handler for MgmtHandler {
fn process_query(&mut self, pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
try_process_query(pgb, query).map_err(|e| {
error!("failed to process response: {e:?}");
e
})
fn process_query(
&mut self,
pgb: &mut PostgresBackend,
query_string: &str,
) -> anyhow::Result<()> {
let res = try_process_query(pgb, query_string);
// intercept and log error message
if res.is_err() {
error!("mgmt query failed: {res:?}");
}
res
}
}
fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<()> {
let resp: PsqlSessionResponse = serde_json::from_str(query)?;
fn try_process_query(pgb: &mut PostgresBackend, query_string: &str) -> anyhow::Result<()> {
info!("got mgmt query [redacted]"); // Content contains password, don't print it
let span = info_span!("event", session_id = resp.session_id);
let _enter = span.enter();
info!("got response: {:?}", resp.result);
let resp: PsqlSessionResponse = serde_json::from_str(query_string)?;
match auth::backend::notify(&resp.session_id, resp.result.into_compute_ready()) {
Ok(()) => {
@@ -128,50 +119,9 @@ fn try_process_query(pgb: &mut PostgresBackend, query: &str) -> anyhow::Result<(
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
Err(e) => {
error!("failed to deliver response to per-client task");
pgb.write_message(&BeMessage::ErrorResponse(&e.to_string()))?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn parse_db_info() -> anyhow::Result<()> {
// with password
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"password": "password",
"project": "hello_world",
}))?;
// without password
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"project": "hello_world",
}))?;
// new field (forward compatibility)
let _: DatabaseInfo = serde_json::from_value(json!({
"host": "localhost",
"port": 5432,
"dbname": "postgres",
"user": "john_doe",
"project": "hello_world",
"N.E.W": "forward compatibility check",
}))?;
Ok(())
}
}

View File

@@ -4,7 +4,7 @@ use crate::config::{ProxyConfig, TlsConfig};
use crate::stream::{MeasuredStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use metrics::{register_int_counter, IntCounter};
use once_cell::sync::Lazy;
use pq_proto::{BeMessage as Be, *};
use std::sync::Arc;
@@ -30,16 +30,10 @@ static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
.unwrap()
});
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
register_int_counter_vec!(
"proxy_io_bytes_per_client",
"Number of bytes sent/received between client and backend.",
&[
// Received (rx) / sent (tx).
"direction",
// Proxy can keep calling it `project` internally.
"endpoint_id"
]
static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"proxy_io_bytes_total",
"Number of bytes sent/received between any client and backend."
)
.unwrap()
});
@@ -236,17 +230,16 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
application_name: params.get("application_name"),
};
let auth_result = async {
// `&mut stream` doesn't let us merge those 2 lines.
let res = creds.authenticate(&extra, &mut stream).await;
async { res }.or_else(|e| stream.throw_error(e)).await
}
.instrument(info_span!("auth"))
.await?;
// Authenticate and connect to a compute node.
let auth = creds
.authenticate(&extra, &mut stream)
.instrument(info_span!("auth"))
.await;
let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;
let reported_auth_ok = node.reported_auth_ok;
let node = auth_result.value;
let (db, cancel_closure) = node
.config
.connect(params)
.or_else(|e| stream.throw_error(e))
.await?;
@@ -254,9 +247,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
let cancel_key_data = session.enable_query_cancellation(cancel_closure);
// Report authentication success if we haven't done this already.
// Note that we do this only (for the most part) after we've connected
// to a compute (see above) which performs its own authentication.
if !auth_result.reported_auth_ok {
if !reported_auth_ok {
stream
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
@@ -270,23 +261,17 @@ impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
.write_message(&BeMessage::ReadyForQuery)
.await?;
// TODO: add more identifiers.
let metric_id = node.project;
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx", &metric_id]);
let mut client = MeasuredStream::new(stream.into_inner(), |cnt| {
// Number of bytes we sent to the client (outbound).
m_sent.inc_by(cnt as u64);
});
let m_recv = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["rx", &metric_id]);
let mut db = MeasuredStream::new(db.stream, |cnt| {
// Number of bytes the client sent to the compute node (inbound).
m_recv.inc_by(cnt as u64);
});
/// This function will be called for writes to either direction.
fn inc_proxied(cnt: usize) {
// Consider inventing something more sophisticated
// if this ever becomes a bottleneck (cacheline bouncing).
NUM_BYTES_PROXIED_COUNTER.inc_by(cnt as u64);
}
// Starting from here we only proxy the client's traffic.
info!("performing the proxy pass...");
let mut db = MeasuredStream::new(db.stream, inc_proxied);
let mut client = MeasuredStream::new(stream.into_inner(), inc_proxied);
let _ = tokio::io::copy_bidirectional(&mut client, &mut db).await?;
Ok(())

View File

@@ -11,7 +11,7 @@ psycopg2-binary = "^2.9.1"
typing-extensions = "^4.1.0"
PyJWT = {version = "^2.1.0", extras = ["crypto"]}
requests = "^2.26.0"
pytest-xdist = "^3.0.2"
pytest-xdist = "^2.3.0"
asyncpg = "^0.24.0"
aiopg = "^1.3.1"
Jinja2 = "^3.0.2"
@@ -29,8 +29,6 @@ pytest-order = "^1.0.1"
allure-pytest = "^2.10.0"
pytest-asyncio = "^0.19.0"
toml = "^0.10.2"
psutil = "^5.9.4"
types-psutil = "^5.9.5.4"
[tool.poetry.dev-dependencies]
flake8 = "^5.0.4"

View File

@@ -556,6 +556,10 @@ impl Timeline {
.pageserver_feedback
.map(|f| Lsn(f.ps_applylsn))
.unwrap_or(Lsn::INVALID);
info!(
"checking should ws stop ttid {} lsn {} rcl {}",
self.ttid, reported_remote_consistent_lsn, shared_state.sk.inmem.commit_lsn
);
let stop = shared_state.sk.inmem.commit_lsn == Lsn(0) || // no data at all yet
(reported_remote_consistent_lsn!= Lsn::MAX && // Lsn::MAX means that we don't know the latest LSN yet.
reported_remote_consistent_lsn >= shared_state.sk.inmem.commit_lsn);

51
scripts/docker-compose_test.sh Executable file
View File

@@ -0,0 +1,51 @@
#!/bin/bash

# Smoke test for the docker-compose setup.
# This is a shortcut script to avoid duplication in CI.
#
# For each Postgres version listed in the loop below it:
#   1. brings the compose stack up (rebuilding images),
#   2. polls the logs of the "compute_is_ready" service for up to 60 seconds,
#      waiting for an "accepting connections" line,
#   3. runs a few simple SQL statements through psql inside the compute container,
#   4. dumps container info/logs and tears the stack down.
#
# Exits with status 1 if the compute does not become ready in time.

set -eux -o pipefail

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Quoted wherever it is used so that a checkout path containing spaces works.
COMPOSE_FILE="$SCRIPT_DIR/../docker-compose/docker-compose.yml"

COMPUTE_CONTAINER_NAME=dockercompose_compute_1
SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;"
# The single quotes around $SQL are interpreted by the `bash -c` inside the container.
PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres"

# Print diagnostics (running containers and compose logs), then stop the stack.
cleanup() {
    echo "show container information"
    docker ps
    docker-compose -f "$COMPOSE_FILE" logs
    echo "stop containers..."
    docker-compose -f "$COMPOSE_FILE" down
}

echo "clean up containers if exists"
cleanup

for pg_version in 14 15; do
    echo "start containers (pg_version=$pg_version)."
    PG_VERSION=$pg_version TAG=latest docker-compose -f "$COMPOSE_FILE" up --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
    while sleep 1; do
        # check timeout
        cnt=$((cnt + 1))
        if [ "$cnt" -gt 60 ]; then
            echo "timeout before the compute is ready."
            cleanup
            exit 1
        fi

        # check if the compute is ready
        # pipefail is switched off for this pipeline only: grep exits non-zero
        # while there is no match yet, and that must not abort the script.
        set +o pipefail
        result=$(docker-compose -f "$COMPOSE_FILE" logs "compute_is_ready" | grep "accepting connections" | wc -l)
        set -o pipefail

        # ">= 1" rather than "== 1": the marker line may be logged more than
        # once between two polls, which must still count as ready.
        if [ "$result" -ge 1 ]; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
            # No -it here: allocating a TTY fails when the script runs
            # non-interactively (e.g. in CI), and `psql -c` needs neither
            # stdin nor a terminal.
            docker exec "$COMPUTE_CONTAINER_NAME" /bin/bash -c "psql $PSQL_OPTION"
            cleanup
            break
        fi
    done
done

View File

@@ -11,37 +11,39 @@ from datetime import datetime
from pathlib import Path
# Type-related stuff
from typing import Callable, ClassVar, Iterator, Optional
from typing import Iterator, Optional
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
from _pytest.terminal import TerminalReporter
from fixtures.neon_fixtures import NeonPageserver
from fixtures.types import TenantId, TimelineId
"""
This file contains fixtures for micro-benchmarks.
To use, declare the `zenbenchmark` fixture in the test function. Run the
benchmark, and then record the result by calling `zenbenchmark.record`. For example:
To use, declare the 'zenbenchmark' fixture in the test function. Run the
benchmark, and then record the result by calling zenbenchmark.record. For example:
>>> import timeit
>>> from fixtures.neon_fixtures import NeonEnv
>>> def test_mybench(neon_simple_env: NeonEnv, zenbenchmark):
... # Initialize the test
... ...
... # Run the test, timing how long it takes
... with zenbenchmark.record_duration('test_query'):
... cur.execute('SELECT test_query(...)')
... # Record another measurement
... zenbenchmark.record('speed_of_light', 300000, 'km/s')
import timeit
from fixtures.neon_fixtures import NeonEnv
def test_mybench(neon_simple_env: env, zenbenchmark):
# Initialize the test
...
# Run the test, timing how long it takes
with zenbenchmark.record_duration('test_query'):
cur.execute('SELECT test_query(...)')
# Record another measurement
zenbenchmark.record('speed_of_light', 300000, 'km/s')
There's no need to import this file to use it. It should be declared as a plugin
inside `conftest.py`, and that makes it available to all tests.
inside conftest.py, and that makes it available to all tests.
You can measure multiple things in one test, and record each one with a separate
call to `zenbenchmark`. For example, you could time the bulk loading that happens
call to zenbenchmark. For example, you could time the bulk loading that happens
in the test initialization, or measure disk usage after the test query.
"""
@@ -115,7 +117,7 @@ class PgBenchRunResult:
# tps = 309.281539 (without initial connection time)
if line.startswith("tps = ") and (
"(excluding connections establishing)" in line
or "(without initial connection time)" in line
or "(without initial connection time)"
):
tps = float(line.split()[2])
@@ -135,17 +137,6 @@ class PgBenchRunResult:
@dataclasses.dataclass
class PgBenchInitResult:
REGEX: ClassVar[re.Pattern] = re.compile( # type: ignore[type-arg]
r"done in (\d+\.\d+) s "
r"\("
r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
r"(?:create tables (\d+\.\d+) s)?(?:, )?"
r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
r"\)\."
)
total: float
drop_tables: Optional[float]
create_tables: Optional[float]
@@ -169,7 +160,18 @@ class PgBenchInitResult:
last_line = stderr.splitlines()[-1]
if (m := cls.REGEX.match(last_line)) is not None:
regex = re.compile(
r"done in (\d+\.\d+) s "
r"\("
r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
r"(?:create tables (\d+\.\d+) s)?(?:, )?"
r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
r"\)\."
)
if (m := regex.match(last_line)) is not None:
total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [
float(v) for v in m.groups() if v is not None
]
@@ -206,7 +208,7 @@ class NeonBenchmarker:
function by the zenbenchmark fixture
"""
def __init__(self, property_recorder: Callable[[str, object], None]):
def __init__(self, property_recorder):
# property recorder here is a pytest fixture provided by junitxml module
# https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
self.property_recorder = property_recorder
@@ -234,7 +236,7 @@ class NeonBenchmarker:
)
@contextmanager
def record_duration(self, metric_name: str) -> Iterator[None]:
def record_duration(self, metric_name: str):
"""
Record a duration. Usage:
@@ -335,21 +337,21 @@ class NeonBenchmarker:
f"{prefix}.{metric}", value, unit="s", report=MetricReport.LOWER_IS_BETTER
)
def get_io_writes(self, pageserver: NeonPageserver) -> int:
def get_io_writes(self, pageserver) -> int:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver
"""
metric_name = r'libmetrics_disk_io_bytes_total{io_operation="write"}'
return self.get_int_counter_value(pageserver, metric_name)
def get_peak_mem(self, pageserver: NeonPageserver) -> int:
def get_peak_mem(self, pageserver) -> int:
"""
Fetch the "maxrss" metric from the pageserver
"""
metric_name = r"libmetrics_maxrss_kb"
return self.get_int_counter_value(pageserver, metric_name)
def get_int_counter_value(self, pageserver: NeonPageserver, metric_name: str) -> int:
def get_int_counter_value(self, pageserver, metric_name) -> int:
"""Fetch the value of given int counter from pageserver metrics."""
# TODO: If we start to collect more of the prometheus metrics in the
# performance test suite like this, we should refactor this to load and
@@ -363,9 +365,7 @@ class NeonBenchmarker:
assert matches, f"metric {metric_name} not found"
return int(round(float(matches.group(1))))
def get_timeline_size(
self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId
) -> int:
def get_timeline_size(self, repo_dir: Path, tenant_id: TenantId, timeline_id: TimelineId):
"""
Calculate the on-disk size of a timeline
"""
@@ -379,9 +379,7 @@ class NeonBenchmarker:
return totalbytes
@contextmanager
def record_pageserver_writes(
self, pageserver: NeonPageserver, metric_name: str
) -> Iterator[None]:
def record_pageserver_writes(self, pageserver, metric_name):
"""
Record bytes written by the pageserver during a test.
"""
@@ -398,7 +396,7 @@ class NeonBenchmarker:
@pytest.fixture(scope="function")
def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[NeonBenchmarker]:
def zenbenchmark(record_property) -> Iterator[NeonBenchmarker]:
"""
This is a python decorator for benchmark fixtures. It contains functions for
recording measurements, and prints them out at the end.
@@ -407,7 +405,7 @@ def zenbenchmark(record_property: Callable[[str, object], None]) -> Iterator[Neo
yield benchmarker
def pytest_addoption(parser: Parser):
def pytest_addoption(parser):
parser.addoption(
"--out-dir",
dest="out_dir",
@@ -431,9 +429,7 @@ def get_out_path(target_dir: Path, revision: str) -> Path:
# Hook to print the results at the end
@pytest.hookimpl(hookwrapper=True)
def pytest_terminal_summary(
terminalreporter: TerminalReporter, exitstatus: int, config: Config
) -> Iterator[None]:
def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
yield
revision = os.getenv("GITHUB_SHA", "local")
platform = os.getenv("PLATFORM", "local")

View File

@@ -1,11 +1,10 @@
from abc import ABC, abstractmethod
from contextlib import _GeneratorContextManager, contextmanager
from contextlib import contextmanager
# Type-related stuff
from typing import Dict, Iterator, List
from typing import Dict, List
import pytest
from _pytest.fixtures import FixtureRequest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
from fixtures.neon_fixtures import NeonEnv, PgBin, PgProtocol, RemotePostgres, VanillaPostgres
from fixtures.pg_stats import PgStatTable
@@ -29,20 +28,19 @@ class PgCompare(ABC):
pass
@property
@abstractmethod
def zenbenchmark(self) -> NeonBenchmarker:
pass
@abstractmethod
def flush(self):
def flush(self) -> None:
pass
@abstractmethod
def report_peak_memory_use(self):
def report_peak_memory_use(self) -> None:
pass
@abstractmethod
def report_size(self):
def report_size(self) -> None:
pass
@contextmanager
@@ -56,7 +54,7 @@ class PgCompare(ABC):
pass
@contextmanager
def record_pg_stats(self, pg_stats: List[PgStatTable]) -> Iterator[None]:
def record_pg_stats(self, pg_stats: List[PgStatTable]):
init_data = self._retrieve_pg_stats(pg_stats)
yield
@@ -86,11 +84,7 @@ class NeonCompare(PgCompare):
"""PgCompare interface for the neon stack."""
def __init__(
self,
zenbenchmark: NeonBenchmarker,
neon_simple_env: NeonEnv,
pg_bin: PgBin,
branch_name: str,
self, zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, branch_name
):
self.env = neon_simple_env
self._zenbenchmark = zenbenchmark
@@ -103,15 +97,15 @@ class NeonCompare(PgCompare):
self.timeline = self.pg.safe_psql("SHOW neon.timeline_id")[0][0]
@property
def pg(self) -> PgProtocol:
def pg(self):
return self._pg
@property
def zenbenchmark(self) -> NeonBenchmarker:
def zenbenchmark(self):
return self._zenbenchmark
@property
def pg_bin(self) -> PgBin:
def pg_bin(self):
return self._pg_bin
def flush(self):
@@ -120,7 +114,7 @@ class NeonCompare(PgCompare):
def compact(self):
self.pageserver_http_client.timeline_compact(self.env.initial_tenant, self.timeline)
def report_peak_memory_use(self):
def report_peak_memory_use(self) -> None:
self.zenbenchmark.record(
"peak_mem",
self.zenbenchmark.get_peak_mem(self.env.pageserver) / 1024,
@@ -128,7 +122,7 @@ class NeonCompare(PgCompare):
report=MetricReport.LOWER_IS_BETTER,
)
def report_size(self):
def report_size(self) -> None:
timeline_size = self.zenbenchmark.get_timeline_size(
self.env.repo_dir, self.env.initial_tenant, self.timeline
)
@@ -150,17 +144,17 @@ class NeonCompare(PgCompare):
"num_files_uploaded", total_files, "", report=MetricReport.LOWER_IS_BETTER
)
def record_pageserver_writes(self, out_name: str) -> _GeneratorContextManager[None]:
def record_pageserver_writes(self, out_name):
return self.zenbenchmark.record_pageserver_writes(self.env.pageserver, out_name)
def record_duration(self, out_name: str) -> _GeneratorContextManager[None]:
def record_duration(self, out_name):
return self.zenbenchmark.record_duration(out_name)
class VanillaCompare(PgCompare):
"""PgCompare interface for vanilla postgres."""
def __init__(self, zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres):
def __init__(self, zenbenchmark, vanilla_pg: VanillaPostgres):
self._pg = vanilla_pg
self._zenbenchmark = zenbenchmark
vanilla_pg.configure(
@@ -176,24 +170,24 @@ class VanillaCompare(PgCompare):
self.cur = self.conn.cursor()
@property
def pg(self) -> PgProtocol:
def pg(self):
return self._pg
@property
def zenbenchmark(self) -> NeonBenchmarker:
def zenbenchmark(self):
return self._zenbenchmark
@property
def pg_bin(self) -> PgBin:
def pg_bin(self):
return self._pg.pg_bin
def flush(self):
self.cur.execute("checkpoint")
def report_peak_memory_use(self):
def report_peak_memory_use(self) -> None:
pass # TODO find something
def report_size(self):
def report_size(self) -> None:
data_size = self.pg.get_subdir_size("base")
self.zenbenchmark.record(
"data_size", data_size / (1024 * 1024), "MB", report=MetricReport.LOWER_IS_BETTER
@@ -204,17 +198,17 @@ class VanillaCompare(PgCompare):
)
@contextmanager
def record_pageserver_writes(self, out_name: str) -> Iterator[None]:
def record_pageserver_writes(self, out_name):
yield # Do nothing
def record_duration(self, out_name: str) -> _GeneratorContextManager[None]:
def record_duration(self, out_name):
return self.zenbenchmark.record_duration(out_name)
class RemoteCompare(PgCompare):
"""PgCompare interface for a remote postgres instance."""
def __init__(self, zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres):
def __init__(self, zenbenchmark, remote_pg: RemotePostgres):
self._pg = remote_pg
self._zenbenchmark = zenbenchmark
@@ -223,60 +217,55 @@ class RemoteCompare(PgCompare):
self.cur = self.conn.cursor()
@property
def pg(self) -> PgProtocol:
def pg(self):
return self._pg
@property
def zenbenchmark(self) -> NeonBenchmarker:
def zenbenchmark(self):
return self._zenbenchmark
@property
def pg_bin(self) -> PgBin:
def pg_bin(self):
return self._pg.pg_bin
def flush(self):
# TODO: flush the remote pageserver
pass
def report_peak_memory_use(self):
def report_peak_memory_use(self) -> None:
# TODO: get memory usage from remote pageserver
pass
def report_size(self):
def report_size(self) -> None:
# TODO: get storage size from remote pageserver
pass
@contextmanager
def record_pageserver_writes(self, out_name: str) -> Iterator[None]:
def record_pageserver_writes(self, out_name):
yield # Do nothing
def record_duration(self, out_name: str) -> _GeneratorContextManager[None]:
def record_duration(self, out_name):
return self.zenbenchmark.record_duration(out_name)
@pytest.fixture(scope="function")
def neon_compare(
request: FixtureRequest,
zenbenchmark: NeonBenchmarker,
pg_bin: PgBin,
neon_simple_env: NeonEnv,
) -> NeonCompare:
def neon_compare(request, zenbenchmark, pg_bin, neon_simple_env) -> NeonCompare:
branch_name = request.node.name
return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name)
@pytest.fixture(scope="function")
def vanilla_compare(zenbenchmark: NeonBenchmarker, vanilla_pg: VanillaPostgres) -> VanillaCompare:
def vanilla_compare(zenbenchmark, vanilla_pg) -> VanillaCompare:
return VanillaCompare(zenbenchmark, vanilla_pg)
@pytest.fixture(scope="function")
def remote_compare(zenbenchmark: NeonBenchmarker, remote_pg: RemotePostgres) -> RemoteCompare:
def remote_compare(zenbenchmark, remote_pg) -> RemoteCompare:
return RemoteCompare(zenbenchmark, remote_pg)
@pytest.fixture(params=["vanilla_compare", "neon_compare"], ids=["vanilla", "neon"])
def neon_with_baseline(request: FixtureRequest) -> PgCompare:
def neon_with_baseline(request) -> PgCompare:
"""Parameterized fixture that helps compare neon against vanilla postgres.
A test that uses this fixture turns into a parameterized test that runs against:
@@ -297,6 +286,8 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare:
implementation-specific logic is widely useful across multiple tests, it might
make sense to add methods to the PgCompare class.
"""
fixture = request.getfixturevalue(request.param) # type: ignore
assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare"
return fixture
fixture = request.getfixturevalue(request.param)
if isinstance(fixture, PgCompare):
return fixture
else:
raise AssertionError(f"test error: fixture {request.param} is not PgCompare")

View File

@@ -1,5 +1,5 @@
from collections import defaultdict
from typing import Dict, List, Optional, Tuple
from typing import Dict, List
from prometheus_client.parser import text_string_to_metric_families
from prometheus_client.samples import Sample
@@ -23,13 +23,13 @@ class Metrics:
pass
return res
def query_one(self, name: str, filter: Optional[Dict[str, str]] = None) -> Sample:
res = self.query_all(name, filter or {})
def query_one(self, name: str, filter: Dict[str, str] = {}) -> Sample:
res = self.query_all(name, filter)
assert len(res) == 1, f"expected single sample for {name} {filter}, found {res}"
return res[0]
def parse_metrics(text: str, name: str = "") -> Metrics:
def parse_metrics(text: str, name: str = ""):
metrics = Metrics(name)
gen = text_string_to_metric_families(text)
for family in gen:
@@ -39,7 +39,7 @@ def parse_metrics(text: str, name: str = "") -> Metrics:
return metrics
PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
PAGESERVER_PER_TENANT_METRICS = [
"pageserver_current_logical_size",
"pageserver_current_physical_size",
"pageserver_getpage_reconstruct_seconds_bucket",
@@ -62,4 +62,4 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = (
"pageserver_wait_lsn_seconds_sum",
"pageserver_created_persistent_files_total",
"pageserver_written_persistent_bytes_total",
)
]

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,3 @@
from functools import cached_property
from typing import List
import pytest
@@ -14,7 +13,7 @@ class PgStatTable:
self.columns = columns
self.additional_query = filter_query
@cached_property
@property
def query(self) -> str:
return f"SELECT {','.join(self.columns)} FROM {self.table} {self.additional_query}"
@@ -56,5 +55,6 @@ def pg_stats_wal() -> List[PgStatTable]:
PgStatTable(
"pg_stat_wal",
["wal_records", "wal_fpi", "wal_bytes", "wal_buffers_full", "wal_write"],
"",
)
]

View File

@@ -1,8 +1,4 @@
from typing import Any, List
import pytest
from _pytest.config import Config
from _pytest.config.argparsing import Parser
"""
This plugin allows tests to be marked as slow using pytest.mark.slow. By default slow
@@ -13,15 +9,15 @@ Copied from here: https://docs.pytest.org/en/latest/example/simple.html
"""
def pytest_addoption(parser: Parser):
def pytest_addoption(parser):
parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")
def pytest_configure(config: Config):
def pytest_configure(config):
config.addinivalue_line("markers", "slow: mark test as slow to run")
def pytest_collection_modifyitems(config: Config, items: List[Any]):
def pytest_collection_modifyitems(config, items):
if config.getoption("--runslow"):
# --runslow given in cli: do not skip slow tests
return

View File

@@ -1,8 +1,6 @@
import random
from functools import total_ordering
from typing import Any, Type, TypeVar, Union
T = TypeVar("T", bound="Id")
from typing import Union
@total_ordering
@@ -19,35 +17,31 @@ class Lsn:
"""Convert lsn from hex notation to int."""
l, r = x.split("/")
self.lsn_int = (int(l, 16) << 32) + int(r, 16)
assert 0 <= self.lsn_int <= 0xFFFFFFFF_FFFFFFFF
# FIXME: error if it doesn't look like a valid LSN
def __str__(self) -> str:
def __str__(self):
"""Convert lsn from int to standard hex notation."""
return f"{(self.lsn_int >> 32):X}/{(self.lsn_int & 0xFFFFFFFF):X}"
return "{:X}/{:X}".format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF)
def __repr__(self) -> str:
return f'Lsn("{str(self)}")'
def __repr__(self):
return 'Lsn("{:X}/{:X}")'.format(self.lsn_int >> 32, self.lsn_int & 0xFFFFFFFF)
def __int__(self) -> int:
def __int__(self):
return self.lsn_int
def __lt__(self, other: Any) -> bool:
if not isinstance(other, Lsn):
return NotImplemented
def __lt__(self, other: "Lsn") -> bool:
return self.lsn_int < other.lsn_int
def __eq__(self, other: Any) -> bool:
def __eq__(self, other) -> bool:
if not isinstance(other, Lsn):
return NotImplemented
return self.lsn_int == other.lsn_int
# Returns the difference between two Lsns, in bytes
def __sub__(self, other: Any) -> int:
if not isinstance(other, Lsn):
return NotImplemented
def __sub__(self, other: "Lsn") -> int:
return self.lsn_int - other.lsn_int
def __hash__(self) -> int:
def __hash__(self):
return hash(self.lsn_int)
@@ -63,7 +57,7 @@ class Id:
self.id = bytearray.fromhex(x)
assert len(self.id) == 16
def __str__(self) -> str:
def __str__(self):
return self.id.hex()
def __lt__(self, other) -> bool:
@@ -76,20 +70,20 @@ class Id:
return NotImplemented
return self.id == other.id
def __hash__(self) -> int:
def __hash__(self):
return hash(str(self.id))
@classmethod
def generate(cls: Type[T]) -> T:
def generate(cls):
"""Generate a random ID"""
return cls(random.randbytes(16).hex())
class TenantId(Id):
def __repr__(self) -> str:
def __repr__(self):
return f'`TenantId("{self.id.hex()}")'
class TimelineId(Id):
def __repr__(self) -> str:
def __repr__(self):
return f'TimelineId("{self.id.hex()}")'

View File

@@ -6,7 +6,7 @@ import subprocess
import tarfile
import time
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple, TypeVar
from typing import Any, Callable, List, Tuple, TypeVar
import allure # type: ignore
from fixtures.log_helper import log
@@ -15,12 +15,12 @@ from psycopg2.extensions import cursor
Fn = TypeVar("Fn", bound=Callable[..., Any])
def get_self_dir() -> Path:
def get_self_dir() -> str:
"""Get the path to the directory where this script lives."""
return Path(__file__).resolve().parent
return os.path.dirname(os.path.abspath(__file__))
def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str:
def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
"""Run a process and capture its output
Output will go to files named "cmd_NNN.stdout" and "cmd_NNN.stderr"
@@ -30,11 +30,11 @@ def subprocess_capture(capture_dir: Path, cmd: List[str], **kwargs: Any) -> str:
If those files already exist, we will overwrite them.
Returns basepath for files with captured output.
"""
assert isinstance(cmd, list)
base = f"{os.path.basename(cmd[0])}_{global_counter()}"
assert type(cmd) is list
base = os.path.basename(cmd[0]) + "_{}".format(global_counter())
basepath = os.path.join(capture_dir, base)
stdout_filename = f"{basepath}.stdout"
stderr_filename = f"{basepath}.stderr"
stdout_filename = basepath + ".stdout"
stderr_filename = basepath + ".stderr"
try:
with open(stdout_filename, "w") as stdout_f:
@@ -64,7 +64,7 @@ def global_counter() -> int:
return _global_counter
def print_gc_result(row: Dict[str, Any]):
def print_gc_result(row):
log.info("GC duration {elapsed} ms".format_map(row))
log.info(
" total: {layers_total}, needed_by_cutoff {layers_needed_by_cutoff}, needed_by_pitr {layers_needed_by_pitr}"
@@ -78,7 +78,8 @@ def etcd_path() -> Path:
path_output = shutil.which("etcd")
if path_output is None:
raise RuntimeError("etcd not found in PATH")
return Path(path_output)
else:
return Path(path_output)
def query_scalar(cur: cursor, query: str) -> Any:
@@ -123,6 +124,7 @@ def get_timeline_dir_size(path: Path) -> int:
# file is a delta layer
_ = parse_delta_layer(dir_entry.name)
sz += dir_entry.stat().st_size
continue
return sz
@@ -155,8 +157,8 @@ def get_scale_for_db(size_mb: int) -> int:
return round(0.06689 * size_mb - 0.5)
ATTACHMENT_NAME_REGEX: re.Pattern = re.compile( # type: ignore[type-arg]
r"flamegraph\.svg|regression\.diffs|.+\.(?:log|stderr|stdout|filediff|metrics|html)"
ATTACHMENT_NAME_REGEX = re.compile(
r".+\.log|.+\.stderr|.+\.stdout|.+\.filediff|.+\.metrics|flamegraph\.svg|regression\.diffs|.+\.html"
)

View File

@@ -1,22 +1,3 @@
# Running locally
First make a release build. The profiling flag is optional, used only for tests that
generate flame graphs. The `-s` flag just silences a lot of output, and makes it
easier to see if you have compile errors without scrolling up.
`BUILD_TYPE=release CARGO_BUILD_FLAGS="--features=testing,profiling" make -s -j8`
NOTE: the `profiling` flag only works on linux because we use linux-specific
libc APIs like `libc::timer_t`.
Then run the tests
`NEON_BIN=./target/release poetry run pytest test_runner/performance`
Some handy pytest flags for local development:
- `-x` tells pytest to stop on first error
- `-s` shows test output
- `-k` selects a test to run
- `--timeout=0` disables our default timeout of 300s (see `setup.cfg`)
# What performance tests do we have and how we run them
Performance tests are built using the same infrastructure as our usual python integration tests. There are some extra fixtures that help to collect performance metrics, and to run tests against both vanilla PostgreSQL and Neon for comparison.

View File

@@ -4,7 +4,6 @@ from typing import List
from fixtures.benchmark_fixture import PgBenchRunResult
from fixtures.compare_fixtures import NeonCompare
from fixtures.neon_fixtures import fork_at_current_lsn
from performance.test_perf_pgbench import utc_now_timestamp
# -----------------------------------------------------------------------
@@ -44,8 +43,7 @@ def test_compare_child_and_root_pgbench_perf(neon_compare: NeonCompare):
pg_root = env.postgres.create_start("root")
pg_bin.run_capture(["pgbench", "-i", pg_root.connstr(), "-s10"])
fork_at_current_lsn(env, pg_root, "child", "root")
env.neon_cli.create_branch("child", "root")
pg_child = env.postgres.create_start("child")
run_pgbench_on_branch("root", ["pgbench", "-c10", "-T10", pg_root.connstr()])

View File

@@ -1,31 +0,0 @@
from contextlib import closing
from fixtures.neon_fixtures import NeonEnvBuilder
# This test demonstrates how to collect a read trace. It's useful until
# it gets replaced by a test that actually does stuff with the trace.
def test_read_request_tracing(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
env = neon_env_builder.init_start()
tenant, _ = env.neon_cli.create_tenant(
conf={
"trace_read_requests": "true",
}
)
timeline = env.neon_cli.create_timeline("test_trace_replay", tenant_id=tenant)
pg = env.postgres.create_start("test_trace_replay", "main", tenant)
with closing(pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("create table t (i integer);")
cur.execute(f"insert into t values (generate_series(1,{10000}));")
cur.execute("select count(*) from t;")
# Stop pg so we drop the connection and flush the traces
pg.stop()
trace_path = env.repo_dir / "traces" / str(tenant) / str(timeline)
assert trace_path.exists()

View File

@@ -6,7 +6,6 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.compare_fixtures import PgCompare
from fixtures.log_helper import log
from pytest_lazyfixture import lazy_fixture # type: ignore
@pytest.mark.parametrize(
@@ -21,24 +20,11 @@ from pytest_lazyfixture import lazy_fixture # type: ignore
pytest.param(10000000, 1, 4),
],
)
@pytest.mark.parametrize(
"env, scale",
[
# Run on all envs. Use 50x larger table on remote cluster to make sure
# it doesn't fit in shared buffers, which are larger on remote than local.
pytest.param(lazy_fixture("neon_compare"), 1, id="neon"),
pytest.param(lazy_fixture("vanilla_compare"), 1, id="vanilla"),
pytest.param(
lazy_fixture("remote_compare"), 50, id="remote", marks=pytest.mark.remote_cluster
),
],
)
def test_seqscans(env: PgCompare, scale: int, rows: int, iters: int, workers: int):
rows = scale * rows
def test_seqscans(neon_with_baseline: PgCompare, rows: int, iters: int, workers: int):
env = neon_with_baseline
with closing(env.pg.connect()) as conn:
with conn.cursor() as cur:
cur.execute("drop table if exists t;")
cur.execute("create table t (i integer);")
cur.execute(f"insert into t values (generate_series(1,{rows}));")

View File

@@ -2,7 +2,7 @@ import statistics
import threading
import time
import timeit
from typing import Any, Callable, List
from typing import Callable
import pytest
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
@@ -197,7 +197,7 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte
if not isinstance(env, NeonCompare):
return
lsn_write_lags: List[Any] = []
lsn_write_lags = []
last_received_lsn = Lsn(0)
last_pg_flush_lsn = Lsn(0)
@@ -216,7 +216,6 @@ def record_lsn_write_lag(env: PgCompare, run_cond: Callable[[], bool], pool_inte
)
res = cur.fetchone()
assert isinstance(res, tuple)
lsn_write_lags.append(res[0])
curr_received_lsn = Lsn(res[3])

View File

@@ -24,6 +24,7 @@ if __name__ == "__main__":
if (v := os.environ.get(k, None)) is not None
}
row = asyncio.run(run(**kwargs))
loop = asyncio.new_event_loop()
row = loop.run_until_complete(run(**kwargs))
print(row[0])

View File

@@ -46,9 +46,9 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st
raise RuntimeError("docker is required for running this test")
build_cmd = [docker_bin, "build", "--tag", image_tag, f"{Path(__file__).parent / client}"]
subprocess_capture(test_output_dir, build_cmd, check=True)
subprocess_capture(str(test_output_dir), build_cmd, check=True)
run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
basepath = subprocess_capture(test_output_dir, run_cmd, check=True)
basepath = subprocess_capture(str(test_output_dir), run_cmd, check=True)
assert Path(f"{basepath}.stdout").read_text().strip() == "1"

View File

@@ -116,13 +116,6 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
env = neon_simple_env
pageserver_http_client = env.pageserver.http_client()
env.pageserver.allowed_errors.extend(
[
".*invalid branch start lsn: less than latest GC cutoff.*",
".*invalid branch start lsn: less than planned GC cutoff.*",
]
)
# Disable background GC but set the `pitr_interval` to be small, so GC can delete something
tenant, _ = env.neon_cli.create_tenant(
conf={

View File

@@ -13,9 +13,6 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.append(".*invalid branch start lsn.*")
env.pageserver.allowed_errors.append(".*invalid start lsn .* for ancestor timeline.*")
# Branch at the point where only 100 rows were inserted
env.neon_cli.create_branch("test_branch_behind")
pgmain = env.postgres.create_start("test_branch_behind")

Some files were not shown because too many files have changed in this diff Show More