Compare commits

..

4 Commits

Author            SHA1        Message                                          Date
Bojan Serafimov   0705c99fdb  Try larger sleep                                 2022-08-12 09:52:40 -04:00
Bojan Serafimov   21089d5217  Wait for pid death                               2022-08-12 09:21:44 -04:00
Bojan Serafimov   bd33ea9fae  Add hacky solution                               2022-08-12 09:05:51 -04:00
Bojan Serafimov   414279726d  Reproduce pageserver.pid lock on restart issue   2022-08-12 09:01:17 -04:00
61 changed files with 1262 additions and 1572 deletions

View File

@@ -29,12 +29,8 @@ runs:
time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd .
elif [ -f ${SOURCE} ]; then
time tar -cf ${ARCHIVE} --zstd ${SOURCE}
elif ! ls ${SOURCE} > /dev/null 2>&1; then
echo 2>&1 "${SOURCE} does not exist"
exit 2
else
echo 2>&1 "${SOURCE} is neither a directory nor a file, do not know how to handle it"
exit 3
echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it"
fi
- name: Upload artifact

View File

@@ -2,14 +2,30 @@
set -e
if [ -n "${DOCKER_TAG}" ]; then
# Version is DOCKER_TAG but without the prefix
VERSION=$(echo $DOCKER_TAG | sed 's/^.*-//g')
RELEASE=${RELEASE:-false}
# look at docker hub for latest tag for neon docker image
if [ "${RELEASE}" = "true" ]; then
echo "search latest release tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
if [ -z "${VERSION}" ]; then
echo "no any docker tags found, exiting..."
exit 1
else
TAG="release-${VERSION}"
fi
else
echo "Please set DOCKER_TAG environment variable"
exit 1
echo "search latest dev tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
if [ -z "${VERSION}" ]; then
echo "no any docker tags found, exiting..."
exit 1
else
TAG="${VERSION}"
fi
fi
echo "found ${VERSION}"
# do initial cleanup
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
@@ -17,8 +33,8 @@ mkdir neon_install
# retrieve binaries from docker image
echo "getting binaries from docker image"
docker pull --quiet neondatabase/neon:${DOCKER_TAG}
ID=$(docker create neondatabase/neon:${DOCKER_TAG})
docker pull --quiet neondatabase/neon:${TAG}
ID=$(docker create neondatabase/neon:${TAG})
docker cp ${ID}:/data/postgres_install.tar.gz .
tar -xzf postgres_install.tar.gz -C neon_install
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/

View File

@@ -1,4 +1,4 @@
name: Benchmarking
name: benchmarking
on:
# uncomment to run on push for debugging your PR
@@ -15,15 +15,6 @@ on:
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
jobs:
bench:
# this workflow runs on a self-hosted runner
@@ -69,6 +60,7 @@ jobs:
- name: Setup cluster
env:
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
shell: bash -euxo pipefail {0}
run: |
set -e
@@ -104,9 +96,7 @@ jobs:
# since it might generate duplicates when calling ingest_perf_test_result.py
rm -rf perf-report-staging
mkdir -p perf-report-staging
# Set the --sparse-ordering option of the pytest-order plugin to ensure tests run in the order they appear in the file;
# this is important for the test_perf_pgbench.py::test_pgbench_remote_* tests
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
- name: Submit result
env:
@@ -123,106 +113,3 @@ jobs:
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-compare:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: "10gb"
REMOTE_ENV: "1"
POSTGRES_DISTRIB_DIR: /usr
TEST_OUTPUT: /tmp/test_output
strategy:
fail-fast: false
matrix:
connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Calculate platform
id: calculate-platform
env:
CONNSTR: ${{ matrix.connstr }}
run: |
if [ "${CONNSTR}" = "BENCHMARK_CAPTEST_CONNSTR" ]; then
PLATFORM=neon-captest
elif [ "${CONNSTR}" = "BENCHMARK_RDS_CONNSTR" ]; then
PLATFORM=rds-aurora
else
echo 2>&1 "Unknown CONNSTR=${CONNSTR}. Allowed are BENCHMARK_CAPTEST_CONNSTR, and BENCHMARK_RDS_CONNSTR only"
exit 1
fi
echo "::set-output name=PLATFORM::${PLATFORM}"
- name: Install Deps
run: |
echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt -y update
sudo apt install -y postgresql-14 postgresql-client-14
- name: Benchmark init
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
run: |
mkdir -p perf-report-captest
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
- name: Benchmark simple-update
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
run: |
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
- name: Benchmark select-only
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
run: |
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
- name: Submit result
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
run: |
REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
- name: Upload logs
if: always()
uses: ./.github/actions/upload
with:
name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }}
path: /tmp/test_output/
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -7,6 +7,10 @@ on:
- release
pull_request:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
@@ -17,39 +21,9 @@ env:
COPT: '-Werror'
jobs:
tag:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Get build tag
run: |
echo run:$GITHUB_RUN_ID
echo ref:$GITHUB_REF_NAME
echo rev:$(git rev-list --count HEAD)
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
echo "::set-output name=tag::$GITHUB_RUN_ID"
fi
shell: bash
id: build-tag
build-neon:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
strategy:
fail-fast: false
matrix:
@@ -61,7 +35,7 @@ jobs:
GIT_VERSION: ${{ github.sha }}
steps:
- name: Fix git ownership
- name: Fix git ownerwhip
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
@@ -80,7 +54,6 @@ jobs:
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
shell: bash -euxo pipefail {0}
# Set some environment variables used by all the steps.
#
@@ -104,7 +77,6 @@ jobs:
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
shell: bash -euxo pipefail {0}
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
@@ -121,8 +93,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Cache postgres build
id: cache_pg
@@ -134,17 +106,14 @@ jobs:
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: mold -run make postgres -j$(nproc)
shell: bash -euxo pipefail {0}
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
shell: bash -euxo pipefail {0}
- name: Run cargo test
run: |
${cov_prefix} cargo test $CARGO_FLAGS
shell: bash -euxo pipefail {0}
- name: Install rust binaries
run: |
@@ -185,11 +154,9 @@ jobs:
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
shell: bash -euxo pipefail {0}
- name: Install postgres binaries
run: cp -a tmp_install /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
- name: Upload Neon artifact
uses: ./.github/actions/upload
@@ -204,9 +171,7 @@ jobs:
pg_regress-tests:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
strategy:
fail-fast: false
@@ -234,9 +199,7 @@ jobs:
other-tests:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
strategy:
fail-fast: false
@@ -267,9 +230,7 @@ jobs:
benchmarks:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
@@ -300,9 +261,7 @@ jobs:
coverage-report:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ other-tests, pg_regress-tests ]
strategy:
fail-fast: false
@@ -325,7 +284,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -341,7 +300,6 @@ jobs:
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
shell: bash -euxo pipefail {0}
- name: Build and upload coverage report
run: |
@@ -374,13 +332,9 @@ jobs:
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
shell: bash -euxo pipefail {0}
trigger-e2e-tests:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
@@ -415,160 +369,150 @@ jobs:
}
}"
dockerfile-check:
if: github.event_name != 'workflow_dispatch'
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
docker-image:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ pg_regress-tests, other-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
value: ${{ steps.dockerfile-check.outputs.any_changed }}
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Get specific changed files
id: dockerfile-check
uses: tj-actions/changed-files@802732316a11c01531ea72773ec7998155238e31 # v25
with:
files: |
Dockerfile
Dockerfile.compute-tools
./vendor/postgres/Dockerfile
neon-image:
# force building for all 3 images
if: needs.dockerfile-check.outputs.value != 'true'
runs-on: dev
needs: [ dockerfile-check ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
environment: dev
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
with:
submodules: true
fetch-depth: 0
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Kaniko build console
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
with:
driver: docker
compute-tools-image:
if: needs.dockerfile-check.outputs.value != 'true'
runs-on: dev
needs: [ dockerfile-check ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
environment: dev
- name: Get build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: build-tag
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: legacy-build-tag
- name: Build neon Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
pull: true
push: true
tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
docker-image-compute:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ pg_regress-tests, other-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build console
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID
compute-node-image:
if: needs.dockerfile-check.outputs.value != 'true'
runs-on: dev
needs: [ dockerfile-check ]
container: gcr.io/kaniko-project/executor:v1.9.0-debug
environment: dev
steps:
- name: Checkout
uses: actions/checkout@v1 # v3 won't work with kaniko
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Kaniko build console
working-directory: ./vendor/postgres/
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
with:
driver: docker
promote-images:
runs-on: dev
needs: [ neon-image, compute-tools-image, compute-node-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
name: [ neon, compute-tools, compute-node ]
steps:
- name: Promote image to latest
run:
MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=$GITHUB_RUN_ID --query 'images[].imageManifest' --output text) && aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST"
push-docker-hub:
runs-on: dev
needs: [ promote-images, tag ]
container: golang:1.19-bullseye
environment: dev
steps:
- name: Install Crane & ECR helper
- name: Get build tag
run: |
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
# - name: Get build tag
# run: |
# if [[ "$GITHUB_REF_NAME" == "main" ]]; then
# echo "::set-output name=tag::$(git rev-list --count HEAD)"
# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
# echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
# else
# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' "
# echo "::set-output name=tag::$GITHUB_RUN_ID"
# fi
# id: build-tag
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: build-tag
- name: Configure ECR login
- name: Get legacy build tag
run: |
mkdir /github/home/.docker/
echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: legacy-build-tag
- name: Pull neon image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:latest neon
- name: Build compute-tools Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
push: false
file: Dockerfile.compute-tools
tags: neondatabase/compute-tools:local
- name: Pull compute tools image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:latest compute-tools
- name: Push compute-tools Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
push: true
file: Dockerfile.compute-tools
tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
- name: Configure docker login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
echo "" > /github/home/.docker/config.json
crane auth login -u ${{ secrets.NEON_DOCKERHUB_USERNAME }} -p ${{ secrets.NEON_DOCKERHUB_PASSWORD }} index.docker.io
- name: Push neon image to Docker Hub
run: crane push neon neondatabase/neon:${{needs.tag.outputs.build-tag}}
- name: Push compute tools image to Docker Hub
run: crane push compute-tools neondatabase/compute-tools:${{needs.tag.outputs.build-tag}}
- name: Push compute node image to Docker Hub
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
- name: Add latest tag to images
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane tag neondatabase/neon:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-tools:${{needs.tag.outputs.build-tag}} latest
crane tag neondatabase/compute-node:${{needs.tag.outputs.build-tag}} latest
- name: Build compute-node Docker image
uses: docker/build-push-action@v2
with:
context: ./vendor/postgres/
build-args:
COMPUTE_TOOLS_TAG=local
push: true
tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
calculate-deploy-targets:
runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -594,16 +538,14 @@ jobs:
deploy:
runs-on: [ self-hosted, Linux, k8s-runner ]
#container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag ]
# We need both storage **and** compute images for deploy, because control plane
# picks the compute version based on the storage version. If it notices a fresh
# storage it may bump the compute version. And if compute image failed to build
# it may break things badly.
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
@@ -614,19 +556,12 @@ jobs:
submodules: true
fetch-depth: 0
- name: Setup python
uses: actions/setup-python@v4
with:
python-version: '3.10'
- name: Setup ansible
run: |
export PATH="/root/.local/bin:$PATH"
pip install --progress-bar off --user ansible boto3
- name: Redeploy
run: |
export DOCKER_TAG=${{needs.tag.outputs.build-tag}}
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
@@ -649,16 +584,13 @@ jobs:
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag ]
runs-on: [ self-hosted, Linux, k8s-runner ]
# Compute image isn't strictly required for proxy deploy, but let's still wait for it
# to run all deploy jobs consistently.
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
defaults:
run:
shell: bash
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
@@ -671,9 +603,6 @@ jobs:
submodules: true
fetch-depth: 0
- name: Add curl
run: apt update && apt install curl -y
- name: Store kubeconfig file
run: |
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
@@ -686,6 +615,6 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

View File

@@ -101,7 +101,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
- name: Run cargo clippy
run: ./run_clippy.sh

View File

@@ -19,12 +19,8 @@ concurrency:
jobs:
test-postgres-client-libs:
# TODO: switch to gen2 runner, requires docker
runs-on: [ ubuntu-latest ]
env:
TEST_OUTPUT: /tmp/test_output
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -51,7 +47,7 @@ jobs:
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
TEST_OUTPUT: /tmp/test_output
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
run: |
@@ -65,18 +61,9 @@ jobs:
-m "remote_cluster" \
-rA "test_runner/pg_clients"
# We use GitHub's upload-artifact action because `ubuntu-latest` doesn't have the AWS CLI configured.
# It will be fixed after switching to gen2 runner
- name: Upload python test logs
if: always()
uses: actions/upload-artifact@v3
with:
retention-days: 7
name: python-test-pg_clients-${{ runner.os }}-stage-logs
path: ${{ env.TEST_OUTPUT }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
if: failure()
id: slack
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream

Cargo.lock (generated)
View File

@@ -2269,7 +2269,6 @@ dependencies = [
"anyhow",
"async-trait",
"base64",
"bstr",
"bytes",
"clap 3.2.12",
"futures",

View File

@@ -1,4 +1,5 @@
use std::io::Write;
use std::net::TcpStream;
use std::path::PathBuf;
use std::process::Command;
use std::sync::Arc;
@@ -240,23 +241,37 @@ impl SafekeeperNode {
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nSafekeeper connection failed with error: {err}");
}
}
}
if tcp_stopped {
// Also check status on the HTTP port
match self.check_status() {
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
}
Err(err) => {
eprintln!("\nSafekeeper status check failed with error: {err}");
return Ok(());
}
Ok(()) => {
// keep waiting
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
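
The replacement wait loop above probes the safekeeper's TCP port instead of signalling the pid: once connections are refused it falls back to the HTTP status check. A minimal standalone sketch of the same port-probe pattern (the helper name, address format and retry timing are illustrative, not the project's connection_address API):

use std::io::ErrorKind;
use std::net::TcpStream;
use std::thread::sleep;
use std::time::Duration;

/// Poll `addr` (e.g. "127.0.0.1:5454") until nothing is listening there
/// (connection refused) or the retry budget runs out.
fn wait_for_port_to_close(addr: &str, retries: u32) -> bool {
    for _ in 0..retries {
        match TcpStream::connect(addr) {
            // Still accepting connections: the process is alive, keep waiting.
            Ok(_) => {}
            // Connection refused: the listener is gone.
            Err(e) if e.kind() == ErrorKind::ConnectionRefused => return true,
            // Other errors (e.g. timeouts) are reported but not treated as "stopped".
            Err(e) => eprintln!("probe error: {e}"),
        }
        sleep(Duration::from_millis(100));
    }
    false
}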

View File

@@ -1,6 +1,7 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Write};
use std::net::TcpStream;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
@@ -311,23 +312,38 @@ impl PageServerNode {
),
}
// Wait until process is gone
for i in 0..600 {
let signal = None; // Send no signal, just get the error code
match kill(pid, signal) {
Ok(_) => (), // Process exists, keep waiting
Err(Errno::ESRCH) => {
// Process not found, we're done
println!("done!");
return Ok(());
}
Err(err) => bail!(
"Failed to send signal to pageserver with pid {}: {}",
pid,
err.desc()
),
};
let address = connection_address(&self.pg_connection_config);
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nPageserver connection failed with error: {err}");
}
}
}
if tcp_stopped {
// Also check status on the HTTP port
match self.check_status() {
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
}
Err(err) => {
eprintln!("\nPageserver status check failed with error: {err}");
return Ok(());
}
Ok(()) => {
// keep waiting
}
}
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
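
For contrast, the loop being replaced here checked the process directly by sending "signal 0" via the nix crate. A self-contained sketch of that probe, assuming a nix version whose error type is Errno, as in the match arms above:

use nix::errno::Errno;
use nix::sys::signal::kill;
use nix::unistd::Pid;

/// Returns Ok(true) while `pid` still exists and Ok(false) once it is gone.
fn process_exists(pid: i32) -> nix::Result<bool> {
    // Passing `None` sends no signal; the call only performs error checking.
    match kill(Pid::from_raw(pid), None) {
        Ok(()) => Ok(true),             // process is still running
        Err(Errno::ESRCH) => Ok(false), // no such process: it has exited
        Err(e) => Err(e),               // e.g. EPERM: exists but we may not signal it
    }
}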

View File

@@ -44,7 +44,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
fn main() {
// Tell cargo to invalidate the built crate whenever the wrapper changes
println!("cargo:rerun-if-changed=bindgen_deps.h");
println!("cargo:rerun-if-changed=pg_control_ffi.h");
// Finding the location of C headers for the Postgres server:
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
@@ -88,9 +88,9 @@ fn main() {
// the resulting bindings.
let bindings = bindgen::Builder::default()
//
// All the needed PostgreSQL headers are included from 'bindgen_deps.h'
// All the needed PostgreSQL headers are included from 'pg_control_ffi.h'
//
.header("bindgen_deps.h")
.header("pg_control_ffi.h")
//
// Tell cargo to invalidate the built crate whenever any of the
// included header files changed.
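
For reference, the minimal shape of such a bindgen build script looks roughly like the sketch below; the output path and error handling are illustrative, only the header name follows the diff.

// Illustrative build.rs skeleton: re-run when the wrapper header changes,
// generate bindings from it, and write them into OUT_DIR.
use std::{env, path::PathBuf};

fn main() {
    println!("cargo:rerun-if-changed=pg_control_ffi.h");
    let bindings = bindgen::Builder::default()
        .header("pg_control_ffi.h")
        .generate()
        .expect("failed to generate bindings");
    let out = PathBuf::from(env::var("OUT_DIR").unwrap());
    bindings
        .write_to_file(out.join("bindings.rs"))
        .expect("failed to write bindings.rs");
}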

View File

@@ -5,7 +5,7 @@ use crate::pg_constants;
use once_cell::sync::OnceCell;
use regex::Regex;
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
#[derive(Debug, Clone, thiserror::Error, PartialEq)]
pub enum FilePathError {
#[error("invalid relation fork name")]
InvalidForkName,

View File

@@ -16,22 +16,22 @@ use crate::XLogRecord;
use crate::XLOG_PAGE_MAGIC;
use crate::pg_constants::WAL_SEGMENT_SIZE;
use crate::waldecoder::WalStreamDecoder;
use anyhow::{anyhow, bail, ensure};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
use bytes::{Buf, Bytes};
use crc32c::*;
use log::*;
use std::fs::File;
use std::cmp::max;
use std::cmp::min;
use std::fs::{self, File};
use std::io::prelude::*;
use std::io::ErrorKind;
use std::io::SeekFrom;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use utils::bin_ser::DeserializeError;
use utils::bin_ser::SerializeError;
use utils::const_assert;
use utils::lsn::Lsn;
pub const XLOG_FNAME_LEN: usize = 24;
@@ -80,12 +80,12 @@ pub fn XLogSegNoOffsetToRecPtr(
#[allow(non_snake_case)]
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
format!(
return format!(
"{:>08X}{:>08X}{:>08X}",
tli,
logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes),
logSegNo % XLogSegmentsPerXLogId(wal_segsz_bytes)
)
);
}
#[allow(non_snake_case)]
@@ -140,93 +140,338 @@ pub fn to_pg_timestamp(time: SystemTime) -> TimestampTz {
}
}
// Returns (aligned) end_lsn of the last record in data_dir with WAL segments.
// start_lsn must point to some previously known record boundary (beginning of
// the next record). If no valid record after is found, start_lsn is returned
// back.
pub fn find_end_of_wal(
/// Return the offset of the last valid record in segment `segno`, starting
/// the search at `start_offset`. Returns `start_offset` if no records are found.
fn find_end_of_wal_segment(
data_dir: &Path,
segno: XLogSegNo,
tli: TimeLineID,
wal_seg_size: usize,
start_lsn: Lsn, // start reading WAL at this point; must point at record start_lsn.
) -> anyhow::Result<Lsn> {
let mut result = start_lsn;
let mut curr_lsn = start_lsn;
start_offset: usize, // start reading at this point
) -> anyhow::Result<u32> {
// step back to the beginning of the page to read it in...
let mut offs: usize = start_offset - start_offset % XLOG_BLCKSZ;
let mut skipping_first_contrecord: bool = false;
let mut contlen: usize = 0;
let mut xl_crc: u32 = 0;
let mut crc: u32 = 0;
let mut rec_offs: usize = 0;
let mut buf = [0u8; XLOG_BLCKSZ];
let mut decoder = WalStreamDecoder::new(start_lsn);
let file_name = XLogFileName(tli, segno, wal_seg_size);
let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
file.seek(SeekFrom::Start(offs as u64))?;
// xl_crc is the last field in XLogRecord, will not be read into rec_hdr
const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
let mut rec_hdr = [0u8; XLOG_RECORD_CRC_OFFS];
// loop over segments
loop {
let segno = curr_lsn.segment_number(wal_seg_size);
let seg_file_name = XLogFileName(PG_TLI, segno, wal_seg_size);
let seg_file_path = data_dir.join(seg_file_name);
match open_wal_segment(&seg_file_path)? {
None => {
// no more segments
info!(
"find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
result, seg_file_path
trace!("find_end_of_wal_segment(data_dir={}, segno={}, tli={}, wal_seg_size={}, start_offset=0x{:x})", data_dir.display(), segno, tli, wal_seg_size, start_offset);
while offs < wal_seg_size {
// we are at the beginning of the page; read it in
if offs % XLOG_BLCKSZ == 0 {
trace!("offs=0x{:x}: new page", offs);
let bytes_read = file.read(&mut buf)?;
if bytes_read != buf.len() {
bail!(
"failed to read {} bytes from {} at {}",
XLOG_BLCKSZ,
file_name,
offs
);
return Ok(result);
}
Some(mut segment) => {
let seg_offs = curr_lsn.segment_offset(wal_seg_size);
segment.seek(SeekFrom::Start(seg_offs as u64))?;
// loop inside segment
loop {
let bytes_read = segment.read(&mut buf)?;
if bytes_read == 0 {
break; // EOF
}
curr_lsn += bytes_read as u64;
decoder.feed_bytes(&buf[0..bytes_read]);
// advance result past all completely read records
loop {
match decoder.poll_decode() {
Ok(Some(record)) => result = record.0,
Err(e) => {
info!(
"find_end_of_wal reached end at {:?}, decode error: {:?}",
result, e
);
return Ok(result);
}
Ok(None) => break, // need more data
}
let xlp_magic = LittleEndian::read_u16(&buf[0..2]);
let xlp_info = LittleEndian::read_u16(&buf[2..4]);
let xlp_rem_len = LittleEndian::read_u32(&buf[XLP_REM_LEN_OFFS..XLP_REM_LEN_OFFS + 4]);
trace!(
" xlp_magic=0x{:x}, xlp_info=0x{:x}, xlp_rem_len={}",
xlp_magic,
xlp_info,
xlp_rem_len
);
// this is expected in current usage when valid WAL starts after page header
if xlp_magic != XLOG_PAGE_MAGIC as u16 {
trace!(
" invalid WAL file {}.partial magic {} at {:?}",
file_name,
xlp_magic,
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
);
}
if offs == 0 {
offs += XLOG_SIZE_OF_XLOG_LONG_PHD;
if (xlp_info & XLP_FIRST_IS_CONTRECORD) != 0 {
trace!(" first record is contrecord");
skipping_first_contrecord = true;
contlen = xlp_rem_len as usize;
if offs < start_offset {
// Pre-condition failed: the beginning of the segment is unexpectedly corrupted.
ensure!(start_offset - offs >= contlen,
"start_offset is in the middle of the first record (which happens to be a contrecord), \
expected to be on a record boundary. Is the beginning of the segment corrupted?");
contlen = 0;
// keep skipping_first_contrecord to avoid counting the contrecord as valid, we did not check it.
}
} else {
trace!(" first record is not contrecord");
}
} else {
offs += XLOG_SIZE_OF_XLOG_SHORT_PHD;
}
// ... and step forward again if asked
trace!(" skipped header to 0x{:x}", offs);
offs = max(offs, start_offset);
// beginning of the next record
} else if contlen == 0 {
let page_offs = offs % XLOG_BLCKSZ;
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
trace!("offs=0x{:x}: new record, xl_tot_len={}", offs, xl_tot_len);
if xl_tot_len == 0 {
info!(
"find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
Lsn(XLogSegNoOffsetToRecPtr(
segno,
last_valid_rec_pos as u32,
wal_seg_size
))
);
break; // zeros, reached the end
}
if skipping_first_contrecord {
skipping_first_contrecord = false;
trace!(" first contrecord has been just completed");
} else {
trace!(
" updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
last_valid_rec_pos,
offs
);
last_valid_rec_pos = offs;
}
offs += 4;
rec_offs = 4;
contlen = xl_tot_len - 4;
trace!(
" reading rec_hdr[0..4] <-- [0x{:x}; 0x{:x})",
page_offs,
page_offs + 4
);
rec_hdr[0..4].copy_from_slice(&buf[page_offs..page_offs + 4]);
} else {
// we're continuing a record, possibly from previous page.
let page_offs = offs % XLOG_BLCKSZ;
let pageleft = XLOG_BLCKSZ - page_offs;
// read the rest of the record, or as much as fits on this page.
let n = min(contlen, pageleft);
trace!(
"offs=0x{:x}, record continuation, pageleft={}, contlen={}",
offs,
pageleft,
contlen
);
// fill rec_hdr header up to (but not including) xl_crc field
trace!(
" rec_offs={}, XLOG_RECORD_CRC_OFFS={}, XLOG_SIZE_OF_XLOG_RECORD={}",
rec_offs,
XLOG_RECORD_CRC_OFFS,
XLOG_SIZE_OF_XLOG_RECORD
);
if rec_offs < XLOG_RECORD_CRC_OFFS {
let len = min(XLOG_RECORD_CRC_OFFS - rec_offs, n);
trace!(
" reading rec_hdr[{}..{}] <-- [0x{:x}; 0x{:x})",
rec_offs,
rec_offs + len,
page_offs,
page_offs + len
);
rec_hdr[rec_offs..rec_offs + len].copy_from_slice(&buf[page_offs..page_offs + len]);
}
if rec_offs <= XLOG_RECORD_CRC_OFFS && rec_offs + n >= XLOG_SIZE_OF_XLOG_RECORD {
let crc_offs = page_offs - rec_offs + XLOG_RECORD_CRC_OFFS;
// All records are aligned on 8-byte boundary, so their 8-byte frames
// cannot be split between pages. As xl_crc is the last field,
// its content is always on the same page.
const_assert!(XLOG_RECORD_CRC_OFFS % 8 == 4);
// We should always start reading aligned records even in incorrect WALs so if
// the condition is false it is likely a bug. However, it is localized somewhere
// in this function, hence we do not crash and just report failure instead.
ensure!(crc_offs % 8 == 4, "Record is not aligned properly (bug?)");
xl_crc = LittleEndian::read_u32(&buf[crc_offs..crc_offs + 4]);
trace!(
" reading xl_crc: [0x{:x}; 0x{:x}) = 0x{:x}",
crc_offs,
crc_offs + 4,
xl_crc
);
crc = crc32c_append(0, &buf[crc_offs + 4..page_offs + n]);
trace!(
" initializing crc: [0x{:x}; 0x{:x}); crc = 0x{:x}",
crc_offs + 4,
page_offs + n,
crc
);
} else if rec_offs > XLOG_RECORD_CRC_OFFS {
// As all records are 8-byte aligned, the header is already fully read and `crc` is initialized in the branch above.
ensure!(rec_offs >= XLOG_SIZE_OF_XLOG_RECORD);
let old_crc = crc;
crc = crc32c_append(crc, &buf[page_offs..page_offs + n]);
trace!(
" appending to crc: [0x{:x}; 0x{:x}); 0x{:x} --> 0x{:x}",
page_offs,
page_offs + n,
old_crc,
crc
);
} else {
// Correct because of the way conditions are written above.
assert!(rec_offs + n < XLOG_SIZE_OF_XLOG_RECORD);
// If `skipping_first_contrecord == true`, we may be reading from a middle of a record
// which started in the previous segment. Hence there is no point in validating the header.
if !skipping_first_contrecord && rec_offs + n > XLOG_RECORD_CRC_OFFS {
info!(
"Curiously corrupted WAL: a record stops inside the header; \
offs=0x{:x}, record continuation, pageleft={}, contlen={}",
offs, pageleft, contlen
);
break;
}
// Do nothing: we are still reading the header. It's accounted in CRC in the end of the record.
}
rec_offs += n;
offs += n;
contlen -= n;
if contlen == 0 {
trace!(" record completed at 0x{:x}", offs);
crc = crc32c_append(crc, &rec_hdr);
offs = (offs + 7) & !7; // pad to an 8-byte boundary
trace!(
" padded offs to 0x{:x}, crc is {:x}, expected crc is {:x}",
offs,
crc,
xl_crc
);
if skipping_first_contrecord {
// do nothing, the flag will go down on next iteration when we're reading new record
trace!(" first conrecord has been just completed");
} else if crc == xl_crc {
// record is valid, advance the result to its end (with
// alignment to the next record taken into account)
trace!(
" updating last_valid_rec_pos: 0x{:x} --> 0x{:x}",
last_valid_rec_pos,
offs
);
last_valid_rec_pos = offs;
} else {
info!(
"CRC mismatch {} vs {} at {}",
crc, xl_crc, last_valid_rec_pos
);
break;
}
}
}
}
trace!("last_valid_rec_pos=0x{:x}", last_valid_rec_pos);
Ok(last_valid_rec_pos as u32)
}
// Open .partial or full WAL segment file, if present.
fn open_wal_segment(seg_file_path: &Path) -> anyhow::Result<Option<File>> {
let mut partial_path = seg_file_path.to_owned();
partial_path.set_extension("partial");
match File::open(partial_path) {
Ok(file) => Ok(Some(file)),
Err(e) => match e.kind() {
ErrorKind::NotFound => {
// .partial not found, try full
match File::open(seg_file_path) {
Ok(file) => Ok(Some(file)),
Err(e) => match e.kind() {
ErrorKind::NotFound => Ok(None),
_ => Err(e.into()),
},
}
}
_ => Err(e.into()),
},
///
/// Scan a directory that contains PostgreSQL WAL files, for the end of WAL.
/// If precise, returns end LSN (next insertion point, basically);
/// otherwise, start of the last segment.
/// Returns (0, 0) if there is no WAL.
///
pub fn find_end_of_wal(
data_dir: &Path,
wal_seg_size: usize,
precise: bool,
start_lsn: Lsn, // start reading WAL at this point or later
) -> anyhow::Result<(XLogRecPtr, TimeLineID)> {
let mut high_segno: XLogSegNo = 0;
let mut high_tli: TimeLineID = 0;
let mut high_ispartial = false;
for entry in fs::read_dir(data_dir)?.flatten() {
let ispartial: bool;
let entry_name = entry.file_name();
let fname = entry_name
.to_str()
.ok_or_else(|| anyhow!("Invalid file name"))?;
/*
* Check if the filename looks like an xlog file, or a .partial file.
*/
if IsXLogFileName(fname) {
ispartial = false;
} else if IsPartialXLogFileName(fname) {
ispartial = true;
} else {
continue;
}
let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
continue;
}
if segno > high_segno
|| (segno == high_segno && tli > high_tli)
|| (segno == high_segno && tli == high_tli && high_ispartial && !ispartial)
{
high_segno = segno;
high_tli = tli;
high_ispartial = ispartial;
}
}
if high_segno > 0 {
let mut high_offs = 0;
/*
* Move the starting pointer to the start of the next segment, if the
* highest one we saw was completed.
*/
if !high_ispartial {
high_segno += 1;
} else if precise {
/* otherwise locate last record in last partial segment */
if start_lsn.segment_number(wal_seg_size) > high_segno {
bail!(
"provided start_lsn {:?} is beyond highest segno {:?} available",
start_lsn,
high_segno,
);
}
let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
start_lsn.segment_offset(wal_seg_size)
} else {
0
};
high_offs = find_end_of_wal_segment(
data_dir,
high_segno,
high_tli,
wal_seg_size,
start_offset,
)?;
}
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
return Ok((high_ptr, high_tli));
}
Ok((0, 0))
}
pub fn main() {
let mut data_dir = PathBuf::new();
data_dir.push(".");
let wal_end = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, Lsn(0)).unwrap();
println!("wal_end={:?}", wal_end);
let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap();
println!(
"wal_end={:>08X}{:>08X}, tli={}",
(wal_end >> 32) as u32,
wal_end as u32,
tli
);
}
impl XLogRecord {
@@ -350,10 +595,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Result<Bytes, Seriali
mod tests {
use super::*;
use regex::Regex;
use std::cmp::min;
use std::fs;
use std::{env, str::FromStr};
use utils::const_assert;
fn init_logging() {
let _ = env_logger::Builder::from_env(
@@ -364,7 +606,10 @@ mod tests {
.try_init();
}
fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
fn test_end_of_wal<C: wal_craft::Crafter>(
test_name: &str,
expected_end_of_wal_non_partial: Lsn,
) {
use wal_craft::*;
// Craft some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
@@ -385,7 +630,7 @@ mod tests {
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
@@ -397,10 +642,10 @@ mod tests {
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial);
for start_lsn in std::iter::once(Lsn(0))
.chain(intermediate_lsns)
.chain(std::iter::once(expected_end_of_wal_partial))
{
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
// We assume that `start_lsn` is non-decreasing.
@@ -415,7 +660,7 @@ mod tests {
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(*start_lsn) {
if seg_start_lsn > u64::from(start_lsn) {
continue;
}
let mut f = File::options().write(true).open(file.path()).unwrap();
@@ -423,12 +668,18 @@ mod tests {
f.write_all(
&ZEROS[0..min(
WAL_SEGMENT_SIZE,
(u64::from(*start_lsn) - seg_start_lsn) as usize,
(u64::from(start_lsn) - seg_start_lsn) as usize,
)],
)
.unwrap();
}
check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
check_end_of_wal(
&cfg,
&last_segment,
start_lsn,
expected_end_of_wal_non_partial,
expected_end_of_wal_partial,
);
}
}
@@ -465,15 +716,18 @@ mod tests {
cfg: &wal_craft::Conf,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal: Lsn,
expected_end_of_wal_non_partial: Lsn,
expected_end_of_wal_partial: Lsn,
) {
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
// let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
// info!(
// "find_end_of_wal returned wal_end={} with non-partial WAL segment",
// wal_end
// );
// assert_eq!(wal_end, expected_end_of_wal_non_partial);
let (wal_end, tli) =
find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
let wal_end = Lsn(wal_end);
info!(
"find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment",
wal_end, tli
);
assert_eq!(wal_end, expected_end_of_wal_non_partial);
// Rename file to partial to actually find last valid lsn, then rename it back.
fs::rename(
@@ -481,12 +735,14 @@ mod tests {
cfg.wal_dir().join(format!("{}.partial", last_segment)),
)
.unwrap();
let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
let (wal_end, tli) =
find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
let wal_end = Lsn(wal_end);
info!(
"find_end_of_wal returned wal_end={} with partial WAL segment",
wal_end
"find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment",
wal_end, tli
);
assert_eq!(wal_end, expected_end_of_wal);
assert_eq!(wal_end, expected_end_of_wal_partial);
fs::rename(
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
@@ -499,7 +755,10 @@ mod tests {
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
test_end_of_wal::<wal_craft::Simple>(
"test_find_end_of_wal_simple",
"0/2000000".parse::<Lsn>().unwrap(),
);
}
#[test]
@@ -507,14 +766,17 @@ mod tests {
init_logging();
test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
"0/3000000".parse::<Lsn>().unwrap(),
);
}
#[test]
#[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
"test_find_end_of_wal_last_crossing_segment",
"0/3000000".parse::<Lsn>().unwrap(),
);
}
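
Both find_end_of_wal variants above rely on the standard PostgreSQL mapping between LSNs, segment numbers and segment file names (XLogFileName, XLogSegNoOffsetToRecPtr). A self-contained sketch of that arithmetic, using an arbitrary LSN and the default 16 MiB segment size:

// How an LSN maps to a WAL segment file name and an in-segment offset
// (timeline 1, 16 MiB segments; the LSN value below is made up).
const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024;

fn main() {
    let lsn: u64 = 0x0000_0001_7000_0028;
    let segno = lsn / WAL_SEGMENT_SIZE; // segment number
    let offset = lsn % WAL_SEGMENT_SIZE; // offset within that segment
    let segs_per_xlogid = 0x1_0000_0000u64 / WAL_SEGMENT_SIZE; // 256 for 16 MiB segments
    let fname = format!(
        "{:>08X}{:>08X}{:>08X}",
        1u32, // timeline id
        segno / segs_per_xlogid,
        segno % segs_per_xlogid
    );
    // Prints: 000000010000000100000070 at offset 0x28
    println!("{fname} at offset 0x{offset:X}");
}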

View File

@@ -265,7 +265,7 @@ mod tests {
use serde::{Deserialize, Serialize};
use std::io::Cursor;
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct ShortStruct {
a: u8,
b: u32,
@@ -286,7 +286,7 @@ mod tests {
const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7];
const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff];
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct LongMsg {
pub tag: u8,
pub blockpos: u32,

View File

@@ -10,10 +10,12 @@ pub fn get_request_param<'a>(
) -> Result<&'a str, ApiError> {
match request.param(param_name) {
Some(arg) => Ok(arg),
None => Err(ApiError::BadRequest(format!(
"no {} specified in path param",
param_name
))),
None => {
return Err(ApiError::BadRequest(format!(
"no {} specified in path param",
param_name
)))
}
}
}

View File

@@ -18,7 +18,7 @@ pub const XLOG_BLCKSZ: u32 = 8192;
pub struct Lsn(pub u64);
/// We tried to parse an LSN from a string, but failed
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[derive(Debug, PartialEq, thiserror::Error)]
#[error("LsnParseError")]
pub struct LsnParseError;

View File

@@ -50,7 +50,7 @@ pub trait Handler {
/// PostgresBackend protocol state.
/// XXX: The order of the constructors matters.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd)]
#[derive(Clone, Copy, PartialEq, PartialOrd)]
pub enum ProtoState {
Initialization,
Encrypted,

View File

@@ -930,7 +930,7 @@ impl<'a> BeMessage<'a> {
// Neon extension of postgres replication protocol
// See NEON_STATUS_UPDATE_TAG_BYTE
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct ReplicationFeedback {
// Last known size of the timeline. Used to enforce timeline size limit.
pub current_timeline_size: u64,

View File

@@ -9,7 +9,7 @@ use std::sync::Mutex;
use std::time::Duration;
/// An error happened while waiting for a number
#[derive(Debug, PartialEq, Eq, thiserror::Error)]
#[derive(Debug, PartialEq, thiserror::Error)]
#[error("SeqWaitError")]
pub enum SeqWaitError {
/// The wait timeout was reached

View File

@@ -4,7 +4,7 @@ use serde::Deserialize;
use std::io::Read;
use utils::bin_ser::LeSer;
#[derive(Debug, PartialEq, Eq, Deserialize)]
#[derive(Debug, PartialEq, Deserialize)]
pub struct HeaderData {
magic: u16,
info: u16,

View File

@@ -30,9 +30,6 @@ static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
});
#[test]
// [false-positive](https://github.com/rust-lang/rust-clippy/issues/9274),
// we resize the vector so doing some modifications after all
#[allow(clippy::read_zero_byte_vec)]
fn ssl() {
let (mut client_sock, server_sock) = make_tcp_pair();

View File

@@ -167,7 +167,7 @@ fn local_timeline_info_from_repo_timeline(
) -> anyhow::Result<LocalTimelineInfo> {
match repo_timeline {
RepositoryTimeline::Loaded(timeline) => local_timeline_info_from_loaded_timeline(
timeline,
&*timeline,
include_non_incremental_logical_size,
include_non_incremental_physical_size,
),

View File

@@ -209,7 +209,7 @@ where
reader: R,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum VisitDirection {
Forwards,
Backwards,

View File

@@ -4,7 +4,6 @@ use anyhow::{anyhow, bail, ensure, Context, Result};
use bytes::Bytes;
use fail::fail_point;
use itertools::Itertools;
use metrics::core::{AtomicU64, GenericCounter};
use once_cell::sync::Lazy;
use tracing::*;
@@ -224,70 +223,6 @@ impl From<LayeredTimelineEntry> for RepositoryTimeline<LayeredTimeline> {
}
}
struct TimelineMetrics {
pub reconstruct_time_histo: Histogram,
pub materialized_page_cache_hit_counter: GenericCounter<AtomicU64>,
pub flush_time_histo: Histogram,
pub compact_time_histo: Histogram,
pub create_images_time_histo: Histogram,
pub init_logical_size_histo: Histogram,
pub load_layer_map_histo: Histogram,
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub current_physical_size_gauge: UIntGauge,
}
impl TimelineMetrics {
fn new(tenant_id: &ZTenantId, timeline_id: &ZTimelineId) -> Self {
let tenant_id = tenant_id.to_string();
let timeline_id = timeline_id.to_string();
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let flush_time_histo = STORAGE_TIME
.get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id])
.unwrap();
let compact_time_histo = STORAGE_TIME
.get_metric_with_label_values(&["compact", &tenant_id, &timeline_id])
.unwrap();
let create_images_time_histo = STORAGE_TIME
.get_metric_with_label_values(&["create images", &tenant_id, &timeline_id])
.unwrap();
let init_logical_size_histo = STORAGE_TIME
.get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id])
.unwrap();
let load_layer_map_histo = STORAGE_TIME
.get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id])
.unwrap();
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let wait_lsn_time_histo = WAIT_LSN_TIME
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
TimelineMetrics {
reconstruct_time_histo,
materialized_page_cache_hit_counter,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
init_logical_size_histo,
load_layer_map_histo,
last_record_gauge,
wait_lsn_time_histo,
current_physical_size_gauge,
}
}
}
pub struct LayeredTimeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -334,7 +269,14 @@ pub struct LayeredTimeline {
ancestor_lsn: Lsn,
// Metrics
metrics: TimelineMetrics,
reconstruct_time_histo: Histogram,
materialized_page_cache_hit_counter: IntCounter,
flush_time_histo: Histogram,
compact_time_histo: Histogram,
create_images_time_histo: Histogram,
last_record_gauge: IntGauge,
wait_lsn_time_histo: Histogram,
current_physical_size_gauge: UIntGauge,
/// If `true`, will backup its files that appear after each checkpointing to the remote storage.
upload_layers: AtomicBool,
@@ -484,7 +426,7 @@ impl Timeline for LayeredTimeline {
"wait_lsn called by WAL receiver thread"
);
self.metrics.wait_lsn_time_histo.observe_closure_duration(
self.wait_lsn_time_histo.observe_closure_duration(
|| self.last_record_lsn
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
.with_context(|| {
@@ -526,8 +468,7 @@ impl Timeline for LayeredTimeline {
self.get_reconstruct_data(key, lsn, &mut reconstruct_state)?;
self.metrics
.reconstruct_time_histo
self.reconstruct_time_histo
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
}
@@ -589,7 +530,7 @@ impl Timeline for LayeredTimeline {
}
fn get_physical_size(&self) -> u64 {
self.metrics.current_physical_size_gauge.get()
self.current_physical_size_gauge.get()
}
fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
@@ -663,6 +604,43 @@ impl LayeredTimeline {
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
upload_layers: bool,
) -> LayeredTimeline {
let reconstruct_time_histo = RECONSTRUCT_TIME
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
.unwrap();
let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
.unwrap();
let flush_time_histo = STORAGE_TIME
.get_metric_with_label_values(&[
"layer flush",
&tenant_id.to_string(),
&timeline_id.to_string(),
])
.unwrap();
let compact_time_histo = STORAGE_TIME
.get_metric_with_label_values(&[
"compact",
&tenant_id.to_string(),
&timeline_id.to_string(),
])
.unwrap();
let create_images_time_histo = STORAGE_TIME
.get_metric_with_label_values(&[
"create images",
&tenant_id.to_string(),
&timeline_id.to_string(),
])
.unwrap();
let last_record_gauge = LAST_RECORD_LSN
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
.unwrap();
let wait_lsn_time_histo = WAIT_LSN_TIME
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
.unwrap();
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id.to_string(), &timeline_id.to_string()])
.unwrap();
let mut result = LayeredTimeline {
conf,
tenant_conf,
@@ -685,7 +663,14 @@ impl LayeredTimeline {
ancestor_timeline: ancestor,
ancestor_lsn: metadata.ancestor_lsn(),
metrics: TimelineMetrics::new(&tenant_id, &timeline_id),
reconstruct_time_histo,
materialized_page_cache_hit_counter,
flush_time_histo,
compact_time_histo,
create_images_time_histo,
last_record_gauge,
wait_lsn_time_histo,
current_physical_size_gauge,
upload_layers: AtomicBool::new(upload_layers),
@@ -721,8 +706,6 @@ impl LayeredTimeline {
let mut layers = self.layers.write().unwrap();
let mut num_layers = 0;
let timer = self.metrics.load_layer_map_histo.start_timer();
// Scan timeline directory and create ImageFileName and DeltaFilename
// structs representing all files on disk
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
@@ -794,11 +777,7 @@ impl LayeredTimeline {
"loaded layer map with {} layers at {}, total physical size: {}",
num_layers, disk_consistent_lsn, total_physical_size
);
self.metrics
.current_physical_size_gauge
.set(total_physical_size);
timer.stop_and_record();
self.current_physical_size_gauge.set(total_physical_size);
Ok(())
}
@@ -829,16 +808,12 @@ impl LayeredTimeline {
}
}
let timer = self.metrics.init_logical_size_histo.start_timer();
// Have to calculate it the hard way
let last_lsn = self.get_last_record_lsn();
let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
self.current_logical_size
.store(logical_size as isize, AtomicOrdering::SeqCst);
debug!("calculated logical size the hard way: {}", logical_size);
timer.stop_and_record();
Ok(())
}
@@ -903,7 +878,7 @@ impl LayeredTimeline {
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
self.metrics.materialized_page_cache_hit_counter.inc_by(1);
self.materialized_page_cache_hit_counter.inc_by(1);
return Ok(());
}
if prev_lsn <= cont_lsn {
@@ -1099,7 +1074,7 @@ impl LayeredTimeline {
fn finish_write(&self, new_lsn: Lsn) {
assert!(new_lsn.is_aligned());
self.metrics.last_record_gauge.set(new_lsn.0 as i64);
self.last_record_gauge.set(new_lsn.0 as i64);
self.last_record_lsn.advance(new_lsn);
}
@@ -1203,7 +1178,7 @@ impl LayeredTimeline {
}
};
let timer = self.metrics.flush_time_histo.start_timer();
let timer = self.flush_time_histo.start_timer();
loop {
let layers = self.layers.read().unwrap();
@@ -1374,7 +1349,7 @@ impl LayeredTimeline {
// update the timeline's physical size
let sz = new_delta_path.metadata()?.len();
self.metrics.current_physical_size_gauge.add(sz);
self.current_physical_size_gauge.add(sz);
// update metrics
NUM_PERSISTENT_FILES_CREATED.inc_by(1);
PERSISTENT_BYTES_WRITTEN.inc_by(sz);
@@ -1443,7 +1418,7 @@ impl LayeredTimeline {
}
// 3. Compact
let timer = self.metrics.compact_time_histo.start_timer();
let timer = self.compact_time_histo.start_timer();
self.compact_level0(target_file_size)?;
timer.stop_and_record();
}
@@ -1519,7 +1494,7 @@ impl LayeredTimeline {
lsn: Lsn,
force: bool,
) -> Result<HashSet<PathBuf>> {
let timer = self.metrics.create_images_time_histo.start_timer();
let timer = self.create_images_time_histo.start_timer();
let mut image_layers: Vec<ImageLayer> = Vec::new();
let mut layer_paths_to_upload = HashSet::new();
for partition in partitioning.parts.iter() {
@@ -1563,8 +1538,7 @@ impl LayeredTimeline {
let mut layers = self.layers.write().unwrap();
for l in image_layers {
self.metrics
.current_physical_size_gauge
self.current_physical_size_gauge
.add(l.path().metadata()?.len());
layers.insert_historic(Arc::new(l));
}
@@ -1814,8 +1788,7 @@ impl LayeredTimeline {
let new_delta_path = l.path();
// update the timeline's physical size
self.metrics
.current_physical_size_gauge
self.current_physical_size_gauge
.add(new_delta_path.metadata()?.len());
new_layer_paths.insert(new_delta_path);
@@ -1828,9 +1801,7 @@ impl LayeredTimeline {
drop(all_keys_iter);
for l in deltas_to_compact {
if let Some(path) = l.local_path() {
self.metrics
.current_physical_size_gauge
.sub(path.metadata()?.len());
self.current_physical_size_gauge.sub(path.metadata()?.len());
layer_paths_do_delete.insert(path);
}
l.delete()?;
@@ -2087,9 +2058,7 @@ impl LayeredTimeline {
let mut layer_paths_to_delete = HashSet::with_capacity(layers_to_remove.len());
for doomed_layer in layers_to_remove {
if let Some(path) = doomed_layer.local_path() {
self.metrics
.current_physical_size_gauge
.sub(path.metadata()?.len());
self.current_physical_size_gauge.sub(path.metadata()?.len());
layer_paths_to_delete.insert(path);
}
doomed_layer.delete()?;

View File

@@ -979,7 +979,7 @@ enum DownloadStatus {
#[derive(Debug)]
enum UploadStatus {
Uploaded,
Failed(anyhow::Error),
Failed,
Nothing,
}
@@ -1056,43 +1056,41 @@ where
let (upload_status, download_status) = tokio::join!(
async {
if let Some(upload_data) = upload_data {
let upload_retries = upload_data.retries;
match validate_task_retries(upload_retries, max_sync_errors)
match validate_task_retries(upload_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
.await
{
ControlFlow::Continue(()) => {
ControlFlow::Continue(new_upload_data) => {
upload_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
current_remote_timeline.as_ref(),
sync_id,
upload_data,
new_upload_data,
sync_start,
"upload",
)
.await
.await;
UploadStatus::Uploaded
}
ControlFlow::Break(()) => match update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Upload {
uploaded_data: upload_data.data,
upload_failed: true,
},
)
.await
{
Ok(()) => UploadStatus::Failed(anyhow::anyhow!(
"Aborted after retries validation, current retries: {upload_retries}, max retries allowed: {max_sync_errors}"
)),
Err(e) => {
ControlFlow::Break(failed_upload_data) => {
if let Err(e) = update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Upload {
uploaded_data: failed_upload_data.data,
upload_failed: true,
},
)
.await
{
error!("Failed to update remote timeline {sync_id}: {e:?}");
UploadStatus::Failed(e)
}
},
UploadStatus::Failed
}
}
} else {
UploadStatus::Nothing
@@ -1101,23 +1099,23 @@ where
.instrument(info_span!("upload_timeline_data")),
async {
if let Some(download_data) = download_data {
match validate_task_retries(download_data.retries, max_sync_errors)
match validate_task_retries(download_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
.await
{
ControlFlow::Continue(()) => {
ControlFlow::Continue(new_download_data) => {
return download_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
current_remote_timeline.as_ref(),
sync_id,
download_data,
new_download_data,
sync_start,
"download",
)
.await;
}
ControlFlow::Break(()) => {
ControlFlow::Break(_) => {
index
.write()
.await
@@ -1134,29 +1132,29 @@ where
if let Some(delete_data) = batch.delete {
match upload_status {
UploadStatus::Uploaded | UploadStatus::Nothing => {
match validate_task_retries(delete_data.retries, max_sync_errors)
match validate_task_retries(delete_data, max_sync_errors)
.instrument(info_span!("retries_validation"))
.await
{
ControlFlow::Continue(()) => {
ControlFlow::Continue(new_delete_data) => {
delete_timeline_data(
conf,
(storage.as_ref(), &index, sync_queue),
sync_id,
delete_data,
new_delete_data,
sync_start,
"delete",
)
.instrument(info_span!("delete_timeline_data"))
.await;
}
ControlFlow::Break(()) => {
ControlFlow::Break(failed_delete_data) => {
if let Err(e) = update_remote_data(
conf,
storage.as_ref(),
&index,
sync_id,
RemoteDataUpdate::Delete(&delete_data.data.deleted_layers),
RemoteDataUpdate::Delete(&failed_delete_data.data.deleted_layers),
)
.await
{
@@ -1165,8 +1163,8 @@ where
}
}
}
UploadStatus::Failed(e) => {
warn!("Skipping delete task due to failed upload tasks, reenqueuing. Upload data: {:?}, delete data: {delete_data:?}. Upload failure: {e:#}", batch.upload);
UploadStatus::Failed => {
warn!("Skipping delete task due to failed upload tasks, reenqueuing");
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
}
}
@@ -1351,8 +1349,7 @@ async fn upload_timeline_data<P, S>(
new_upload_data: SyncData<LayersUpload>,
sync_start: Instant,
task_name: &str,
) -> UploadStatus
where
) where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
@@ -1365,9 +1362,9 @@ where
)
.await
{
UploadedTimeline::FailedAndRescheduled(e) => {
UploadedTimeline::FailedAndRescheduled => {
register_sync_status(sync_id, sync_start, task_name, Some(false));
return UploadStatus::Failed(e);
return;
}
UploadedTimeline::Successful(upload_data) => upload_data,
};
@@ -1386,14 +1383,12 @@ where
{
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
UploadStatus::Uploaded
}
Err(e) => {
error!("Failed to update remote timeline {sync_id}: {e:?}");
uploaded_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
register_sync_status(sync_id, sync_start, task_name, Some(false));
UploadStatus::Failed(e)
}
}
}
@@ -1496,17 +1491,21 @@ where
.context("Failed to upload new index part")
}
async fn validate_task_retries(
current_attempt: u32,
async fn validate_task_retries<T>(
sync_data: SyncData<T>,
max_sync_errors: NonZeroU32,
) -> ControlFlow<(), ()> {
) -> ControlFlow<SyncData<T>, SyncData<T>> {
let current_attempt = sync_data.retries;
let max_sync_errors = max_sync_errors.get();
if current_attempt >= max_sync_errors {
return ControlFlow::Break(());
error!(
"Aborting task that failed {current_attempt} times, exceeding retries threshold of {max_sync_errors}",
);
return ControlFlow::Break(sync_data);
}
exponential_backoff(current_attempt, 1.0, 30.0).await;
ControlFlow::Continue(())
ControlFlow::Continue(sync_data)
}
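For illustration, the ControlFlow-based gate above keeps ownership of the task data in both outcomes: Continue carries it into the retry, Break hands it back so the caller can mark the remote state as failed. Below is a simplified, synchronous sketch of the same shape; the types and the backoff formula are assumptions for the example, not the real pageserver code.

use std::ops::ControlFlow;
use std::time::Duration;

struct SyncData<T> {
    retries: u32,
    data: T,
}

fn validate_task_retries<T>(
    sync_data: SyncData<T>,
    max_sync_errors: u32,
) -> ControlFlow<SyncData<T>, SyncData<T>> {
    if sync_data.retries >= max_sync_errors {
        // Give the data back to the caller so it can record the failure remotely.
        return ControlFlow::Break(sync_data);
    }
    // Back off before retrying: 1s, 2s, 4s, ... capped at 30s.
    let delay_secs = (1u64 << sync_data.retries.min(5)).min(30);
    std::thread::sleep(Duration::from_secs(delay_secs));
    ControlFlow::Continue(sync_data)
}

fn main() {
    match validate_task_retries(SyncData { retries: 0, data: "upload" }, 3) {
        ControlFlow::Continue(d) => println!("retrying task carrying {:?}", d.data),
        ControlFlow::Break(d) => println!("giving up on task carrying {:?}", d.data),
    }
}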
fn schedule_first_sync_tasks(

View File

@@ -95,8 +95,6 @@ where
debug!("Reenqueuing failed delete task for timeline {sync_id}");
delete_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Delete(delete_data));
} else {
info!("Successfully deleted all layers");
}
errored
}

View File

@@ -75,7 +75,7 @@ where
#[derive(Debug)]
pub(super) enum UploadedTimeline {
/// Upload failed due to some error, the upload task is rescheduled for another retry.
FailedAndRescheduled(anyhow::Error),
FailedAndRescheduled,
/// No issues happened during the upload, all task files were put into the remote storage.
Successful(SyncData<LayersUpload>),
}
@@ -179,7 +179,7 @@ where
})
.collect::<FuturesUnordered<_>>();
let mut errors = Vec::new();
let mut errors_happened = false;
while let Some(upload_result) = upload_tasks.next().await {
match upload_result {
Ok(uploaded_path) => {
@@ -188,13 +188,13 @@ where
}
Err(e) => match e {
UploadError::Other(e) => {
errors_happened = true;
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
errors.push(format!("{e:#}"));
}
UploadError::MissingLocalFile(source_path, e) => {
if source_path.exists() {
errors_happened = true;
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
errors.push(format!("{e:#}"));
} else {
// We have run the upload sync task, but the file we wanted to upload is gone.
// This is "fine" due the asynchronous nature of the sync loop: it only reacts to events and might need to
@@ -217,17 +217,14 @@ where
}
}
if errors.is_empty() {
info!("Successfully uploaded all layers");
UploadedTimeline::Successful(upload_data)
} else {
if errors_happened {
debug!("Reenqueuing failed upload task for timeline {sync_id}");
upload_data.retries += 1;
sync_queue.push(sync_id, SyncTask::Upload(upload_data));
UploadedTimeline::FailedAndRescheduled(anyhow::anyhow!(
"Errors appeared during layer uploads: {:?}",
errors
))
UploadedTimeline::FailedAndRescheduled
} else {
info!("Successfully uploaded all layers");
UploadedTimeline::Successful(upload_data)
}
}
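As a side note, the failure branch above is a requeue-with-incremented-retries pattern: the task goes back onto the sync queue, and the retry counter later feeds the backoff and the max-retries check. A toy sketch of that shape, with a plain VecDeque standing in for the sync queue and invented names throughout:

use std::collections::VecDeque;

struct UploadTask {
    retries: u32,
    layers: Vec<String>,
}

fn handle_result(queue: &mut VecDeque<UploadTask>, mut task: UploadTask, errors_happened: bool) {
    if errors_happened {
        // Failed uploads are retried later; the counter drives backoff and the retry limit.
        task.retries += 1;
        queue.push_back(task);
    } else {
        println!("successfully uploaded {} layers", task.layers.len());
    }
}

fn main() {
    let mut queue = VecDeque::new();
    let task = UploadTask { retries: 0, layers: vec!["layer-1".into()] };
    handle_result(&mut queue, task, true);
    assert_eq!(queue.front().unwrap().retries, 1);
}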

View File

@@ -37,7 +37,7 @@ pub mod defaults {
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "3 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
}

View File

@@ -17,7 +17,7 @@ use std::{
};
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use chrono::{DateTime, Local, NaiveDateTime, Utc};
use etcd_broker::{
subscription_key::SubscriptionKey, subscription_value::SkTimelineInfo, BrokerSubscription,
BrokerUpdate, Client,
@@ -33,10 +33,11 @@ use crate::{
use crate::{RepositoryImpl, TimelineImpl};
use utils::{
lsn::Lsn,
pq_proto::ReplicationFeedback,
zid::{NodeId, ZTenantTimelineId},
};
use super::{walreceiver_connection::WalConnectionStatus, TaskEvent, TaskHandle};
use super::{TaskEvent, TaskHandle};
/// Spawns the loop to take care of the timeline's WAL streaming connection.
pub(super) fn spawn_connection_manager_task(
@@ -113,26 +114,21 @@ async fn connection_manager_loop_step(
}
} => {
let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard");
match wal_connection_update {
match &wal_connection_update {
TaskEvent::Started => {
wal_connection.latest_connection_update = Utc::now().naive_utc();
*walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1;
},
TaskEvent::NewEvent(status) => {
if status.has_received_wal {
// Reset connection attempts here only, we know that safekeeper is healthy
// because it can send us a WAL update.
walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
}
wal_connection.status = status;
TaskEvent::NewEvent(replication_feedback) => {
wal_connection.latest_connection_update = DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc();
// Reset connection attempts here only, the only place where both nodes
// explicitly confirm with replication feedback that they are connected to each other
walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
},
TaskEvent::End(end_result) => {
match end_result {
Ok(()) => debug!("WAL receiving task finished"),
Err(e) => {
warn!("WAL receiving task failed: {e}");
// If the task failed, set the connection attempts to at least 1, to try other safekeepers.
let _ = *walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(1);
}
Err(e) => warn!("WAL receiving task failed: {e}"),
};
walreceiver_state.wal_connection = None;
},
@@ -261,21 +257,10 @@ struct WalreceiverState {
struct WalConnection {
/// Current safekeeper pageserver is connected to for WAL streaming.
sk_id: NodeId,
/// Status of the connection.
status: WalConnectionStatus,
/// Connection task start time or the timestamp of a latest connection message received.
latest_connection_update: NaiveDateTime,
/// WAL streaming task handle.
connection_task: TaskHandle<WalConnectionStatus>,
/// Have we discovered that other safekeeper has more recent WAL than we do?
discovered_new_wal: Option<NewCommittedWAL>,
}
/// Notion of a new committed WAL, which exists on other safekeeper.
#[derive(Debug, Clone, Copy)]
struct NewCommittedWAL {
/// LSN of the new committed WAL.
lsn: Lsn,
/// When we discovered that the new committed WAL exists on other safekeeper.
discovered_at: NaiveDateTime,
connection_task: TaskHandle<ReplicationFeedback>,
}
/// Data about the timeline to connect to, received from etcd.
@@ -342,19 +327,10 @@ impl WalreceiverState {
.instrument(info_span!("walreceiver_connection", id = %id))
});
let now = Utc::now().naive_utc();
self.wal_connection = Some(WalConnection {
sk_id: new_sk_id,
status: WalConnectionStatus {
is_connected: false,
has_received_wal: false,
latest_connection_update: now,
latest_wal_update: now,
streaming_lsn: None,
commit_lsn: None,
},
latest_connection_update: Utc::now().naive_utc(),
connection_task: connection_handle,
discovered_new_wal: None,
});
}
@@ -385,16 +361,14 @@ impl WalreceiverState {
/// Cleans up stale etcd records and checks the rest for the new connection candidate.
/// Returns a new candidate, if the current state is absent or somewhat lagging, `None` otherwise.
/// The current rules for approving new candidates:
/// * pick a candidate different from the connected safekeeper with the biggest `commit_lsn` and the lowest number of failed connection attempts
/// * pick from the input data from etcd for currently connected safekeeper (if any)
/// * out of the remaining input entries, pick the one with the biggest `commit_lsn` that's ahead of the pageserver's latest Lsn for the timeline
/// * if there's no such entry, no new candidate found, abort
/// * otherwise check if the candidate is much better than the current one
///
/// To understand the exact rules for determining if the candidate is better than the current one, refer to this function's implementation.
/// The general rules are as follows:
/// * if connected safekeeper is not present, pick the candidate
/// * if we haven't received any updates for some time, pick the candidate
/// * if the candidate commit_lsn is much higher than the current one, pick the candidate
/// * if connected safekeeper stopped sending us new WAL which is available on other safekeeper, pick the candidate
/// * check the current connection time data for staleness, reconnect if stale
/// * otherwise, check if etcd updates contain currently connected safekeeper
/// * if not, that means no WAL updates happened after certain time (either none since the connection time or none since the last event after the connection)
/// Reconnect if the time exceeds the threshold.
/// * if there's one, compare its Lsn with the other candidate's, reconnect if candidate's over threshold
///
/// This way we ensure to keep up with the most up-to-date safekeeper and don't try to jump from one safekeeper to another too frequently.
/// Both thresholds are configured per tenant.
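A compact sketch of the switching decision described in the comment above, reduced to the two thresholds it mentions. The types and field names are invented for illustration; the real function also weighs etcd data, connection attempts and more.

use std::time::{Duration, Instant};

struct CurrentConnection {
    commit_lsn: u64,
    last_update: Instant,
}

fn should_switch(
    current: Option<&CurrentConnection>,
    candidate_commit_lsn: u64,
    max_lsn_wal_lag: u64,
    lagging_wal_timeout: Duration,
) -> bool {
    match current {
        // No existing connection: always take the candidate.
        None => true,
        Some(cur) => {
            // Candidate is far ahead in WAL, or the current connection has gone silent.
            candidate_commit_lsn.saturating_sub(cur.commit_lsn) >= max_lsn_wal_lag
                || cur.last_update.elapsed() > lagging_wal_timeout
        }
    }
}

fn main() {
    let cur = CurrentConnection { commit_lsn: 100_000, last_update: Instant::now() };
    let lag = 10 * 1024 * 1024;
    // Candidate is a whole lag threshold ahead of the current safekeeper: switch.
    assert!(should_switch(Some(&cur), 100_000 + lag, lag, Duration::from_secs(10)));
    // Candidate is only slightly ahead and the connection is fresh: stay.
    assert!(!should_switch(Some(&cur), 100_001, lag, Duration::from_secs(10)));
    println!("ok");
}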
@@ -410,128 +384,53 @@ impl WalreceiverState {
let now = Utc::now().naive_utc();
if let Ok(latest_interaciton) =
(now - existing_wal_connection.status.latest_connection_update).to_std()
(now - existing_wal_connection.latest_connection_update).to_std()
{
// Drop the connection if we haven't received a keepalive message for a while.
if latest_interaciton > self.wal_connect_timeout {
if latest_interaciton > self.lagging_wal_timeout {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::NoKeepAlives {
last_keep_alive: Some(
existing_wal_connection.status.latest_connection_update,
reason: ReconnectReason::NoWalTimeout {
last_wal_interaction: Some(
existing_wal_connection.latest_connection_update,
),
check_time: now,
threshold: self.wal_connect_timeout,
threshold: self.lagging_wal_timeout,
},
});
}
}
if !existing_wal_connection.status.is_connected {
// We haven't connected yet and we shouldn't switch until connection timeout (condition above).
return None;
}
if let Some(current_commit_lsn) = existing_wal_connection.status.commit_lsn {
let new_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
// Check if the new candidate has much more WAL than the current one.
match new_commit_lsn.0.checked_sub(current_commit_lsn.0) {
Some(new_sk_lsn_advantage) => {
if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::LaggingWal {
current_commit_lsn,
new_commit_lsn,
threshold: self.max_lsn_wal_lag,
},
});
match self.wal_stream_candidates.get(&connected_sk_node) {
Some(current_connection_etcd_data) => {
let new_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
let current_lsn = current_connection_etcd_data
.timeline
.commit_lsn
.unwrap_or(Lsn(0));
match new_lsn.0.checked_sub(current_lsn.0)
{
Some(new_sk_lsn_advantage) => {
if new_sk_lsn_advantage >= self.max_lsn_wal_lag.get() {
return Some(
NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::LaggingWal { current_lsn, new_lsn, threshold: self.max_lsn_wal_lag },
});
}
}
None => debug!("Best SK candidate has its commit Lsn behind the current timeline's latest consistent Lsn"),
}
}
None => debug!(
"Best SK candidate has its commit_lsn behind connected SK's commit_lsn"
),
}
}
let current_lsn = match existing_wal_connection.status.streaming_lsn {
Some(lsn) => lsn,
None => self.local_timeline.get_last_record_lsn(),
};
let current_commit_lsn = existing_wal_connection
.status
.commit_lsn
.unwrap_or(current_lsn);
let candidate_commit_lsn = new_safekeeper_etcd_data.commit_lsn.unwrap_or(Lsn(0));
// Keep discovered_new_wal only if connected safekeeper has not caught up yet.
let mut discovered_new_wal = existing_wal_connection
.discovered_new_wal
.filter(|new_wal| new_wal.lsn > current_commit_lsn);
if discovered_new_wal.is_none() {
// Check if the new candidate has more WAL than the current one.
// If the new candidate has more WAL than the current one, we consider switching to the new candidate.
discovered_new_wal = if candidate_commit_lsn > current_commit_lsn {
trace!(
"New candidate has commit_lsn {}, higher than current_commit_lsn {}",
candidate_commit_lsn,
current_commit_lsn
);
Some(NewCommittedWAL {
lsn: candidate_commit_lsn,
discovered_at: Utc::now().naive_utc(),
None => {
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::NoEtcdDataForExistingConnection,
})
} else {
None
};
}
let waiting_for_new_lsn_since = if current_lsn < current_commit_lsn {
// Connected safekeeper has more WAL, but we haven't received updates for some time.
trace!(
"Connected safekeeper has more WAL, but we haven't received updates for {:?}. current_lsn: {}, current_commit_lsn: {}",
(now - existing_wal_connection.status.latest_wal_update).to_std(),
current_lsn,
current_commit_lsn
);
Some(existing_wal_connection.status.latest_wal_update)
} else {
discovered_new_wal.as_ref().map(|new_wal| {
// We know that new WAL is available on another safekeeper, but the connected safekeeper doesn't have it.
new_wal
.discovered_at
.max(existing_wal_connection.status.latest_wal_update)
})
};
// If we haven't received any WAL updates for a while and candidate has more WAL, switch to it.
if let Some(waiting_for_new_lsn_since) = waiting_for_new_lsn_since {
if let Ok(waiting_for_new_wal) = (now - waiting_for_new_lsn_since).to_std() {
if candidate_commit_lsn > current_commit_lsn
&& waiting_for_new_wal > self.lagging_wal_timeout
{
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_source_connstr: new_wal_source_connstr,
reason: ReconnectReason::NoWalTimeout {
current_lsn,
current_commit_lsn,
candidate_commit_lsn,
last_wal_interaction: Some(
existing_wal_connection.status.latest_wal_update,
),
check_time: now,
threshold: self.lagging_wal_timeout,
},
});
}
}
}
self.wal_connection.as_mut().unwrap().discovered_new_wal = discovered_new_wal;
}
None => {
let (new_sk_id, _, new_wal_source_connstr) =
@@ -551,7 +450,7 @@ impl WalreceiverState {
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
///
/// The candidate that is chosen:
/// * has fewest connection attempts from pageserver to safekeeper node (reset every time we receive a WAL message from the node)
/// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent)
/// * has greatest data Lsn among the ones that are left
///
/// NOTE:
@@ -590,13 +489,14 @@ impl WalreceiverState {
.max_by_key(|(_, info, _)| info.commit_lsn)
}
/// Returns a list of safekeepers that have valid info and are ready for connection.
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, String)> {
self.wal_stream_candidates
.iter()
.filter(|(_, info)| info.timeline.commit_lsn.is_some())
.filter(|(_, etcd_info)| {
etcd_info.timeline.commit_lsn > Some(self.local_timeline.get_last_record_lsn())
})
.filter_map(|(sk_id, etcd_info)| {
let info = &etcd_info.timeline;
match wal_stream_connection_string(
@@ -612,7 +512,6 @@ impl WalreceiverState {
})
}
/// Remove candidates which haven't sent etcd updates for a while.
fn cleanup_old_candidates(&mut self) {
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
@@ -647,24 +546,17 @@ struct NewWalConnectionCandidate {
#[derive(Debug, PartialEq, Eq)]
enum ReconnectReason {
NoExistingConnection,
NoEtcdDataForExistingConnection,
LaggingWal {
current_commit_lsn: Lsn,
new_commit_lsn: Lsn,
current_lsn: Lsn,
new_lsn: Lsn,
threshold: NonZeroU64,
},
NoWalTimeout {
current_lsn: Lsn,
current_commit_lsn: Lsn,
candidate_commit_lsn: Lsn,
last_wal_interaction: Option<NaiveDateTime>,
check_time: NaiveDateTime,
threshold: Duration,
},
NoKeepAlives {
last_keep_alive: Option<NaiveDateTime>,
check_time: NaiveDateTime,
threshold: Duration,
},
}
fn wal_stream_connection_string(
@@ -688,6 +580,7 @@ fn wal_stream_connection_string(
#[cfg(test)]
mod tests {
use std::time::SystemTime;
use crate::repository::{
repo_harness::{RepoHarness, TIMELINE_ID},
@@ -765,7 +658,7 @@ mod tests {
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
latest_update: delay_over_threshold,
@@ -791,26 +684,22 @@ mod tests {
let connected_sk_id = NodeId(0);
let current_lsn = 100_000;
let connection_status = WalConnectionStatus {
is_connected: true,
has_received_wal: true,
latest_connection_update: now,
latest_wal_update: now,
commit_lsn: Some(Lsn(current_lsn)),
streaming_lsn: Some(Lsn(current_lsn)),
};
state.max_lsn_wal_lag = NonZeroU64::new(100).unwrap();
state.wal_connection = Some(WalConnection {
sk_id: connected_sk_id,
status: connection_status.clone(),
latest_connection_update: now,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(connection_status.clone()))
.send(TaskEvent::NewEvent(ReplicationFeedback {
current_timeline_size: 1,
ps_writelsn: 1,
ps_applylsn: current_lsn,
ps_flushlsn: 1,
ps_replytime: SystemTime::now(),
}))
.ok();
Ok(())
}),
discovered_new_wal: None,
});
state.wal_stream_candidates = HashMap::from([
(
@@ -1035,6 +924,65 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;
let mut state = dummy_state(&harness);
let now = Utc::now().naive_utc();
let current_lsn = Lsn(100_000).align();
let connected_sk_id = NodeId(0);
let other_sk_id = NodeId(connected_sk_id.0 + 1);
state.wal_connection = Some(WalConnection {
sk_id: connected_sk_id,
latest_connection_update: now,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(ReplicationFeedback {
current_timeline_size: 1,
ps_writelsn: current_lsn.0,
ps_applylsn: 1,
ps_flushlsn: 1,
ps_replytime: SystemTime::now(),
}))
.ok();
Ok(())
}),
});
state.wal_stream_candidates = HashMap::from([(
other_sk_id,
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(Lsn(1 + state.max_lsn_wal_lag.get())),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
latest_update: now,
},
)]);
let only_candidate = state
.next_connection_candidate()
.expect("Expected one candidate selected out of the only data option, but got none");
assert_eq!(only_candidate.safekeeper_id, other_sk_id);
assert_eq!(
only_candidate.reason,
ReconnectReason::NoEtcdDataForExistingConnection,
"Should select new safekeeper due to missing etcd data, even if there's an existing connection with this safekeeper"
);
assert!(only_candidate
.wal_source_connstr
.contains(DUMMY_SAFEKEEPER_CONNSTR));
Ok(())
}
#[tokio::test]
async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("lsn_wal_over_threshcurrent_candidate")?;
@@ -1045,25 +993,21 @@ mod tests {
let connected_sk_id = NodeId(0);
let new_lsn = Lsn(current_lsn.0 + state.max_lsn_wal_lag.get() + 1);
let connection_status = WalConnectionStatus {
is_connected: true,
has_received_wal: true,
latest_connection_update: now,
latest_wal_update: now,
commit_lsn: Some(current_lsn),
streaming_lsn: Some(current_lsn),
};
state.wal_connection = Some(WalConnection {
sk_id: connected_sk_id,
status: connection_status.clone(),
latest_connection_update: now,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(connection_status.clone()))
.send(TaskEvent::NewEvent(ReplicationFeedback {
current_timeline_size: 1,
ps_writelsn: current_lsn.0,
ps_applylsn: 1,
ps_flushlsn: 1,
ps_replytime: SystemTime::now(),
}))
.ok();
Ok(())
}),
discovered_new_wal: None,
});
state.wal_stream_candidates = HashMap::from([
(
@@ -1108,8 +1052,8 @@ mod tests {
assert_eq!(
over_threshcurrent_candidate.reason,
ReconnectReason::LaggingWal {
current_commit_lsn: current_lsn,
new_commit_lsn: new_lsn,
current_lsn,
new_lsn,
threshold: state.max_lsn_wal_lag
},
"Should select bigger WAL safekeeper if it starts to lag enough"
@@ -1122,35 +1066,31 @@ mod tests {
}
#[tokio::test]
async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("timeout_connection_threshhold_current_candidate")?;
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?;
let mut state = dummy_state(&harness);
let current_lsn = Lsn(100_000).align();
let now = Utc::now().naive_utc();
let wal_connect_timeout = chrono::Duration::from_std(state.wal_connect_timeout)?;
let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
let time_over_threshold =
Utc::now().naive_utc() - wal_connect_timeout - wal_connect_timeout;
let connection_status = WalConnectionStatus {
is_connected: true,
has_received_wal: true,
latest_connection_update: time_over_threshold,
latest_wal_update: time_over_threshold,
commit_lsn: Some(current_lsn),
streaming_lsn: Some(current_lsn),
};
Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;
state.wal_connection = Some(WalConnection {
sk_id: NodeId(1),
status: connection_status.clone(),
latest_connection_update: time_over_threshold,
connection_task: TaskHandle::spawn(move |sender, _| async move {
sender
.send(TaskEvent::NewEvent(connection_status.clone()))
.send(TaskEvent::NewEvent(ReplicationFeedback {
current_timeline_size: 1,
ps_writelsn: current_lsn.0,
ps_applylsn: 1,
ps_flushlsn: 1,
ps_replytime: SystemTime::now(),
}))
.ok();
Ok(())
}),
discovered_new_wal: None,
});
state.wal_stream_candidates = HashMap::from([(
NodeId(0),
@@ -1175,12 +1115,12 @@ mod tests {
assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
match over_threshcurrent_candidate.reason {
ReconnectReason::NoKeepAlives {
last_keep_alive,
ReconnectReason::NoWalTimeout {
last_wal_interaction,
threshold,
..
} => {
assert_eq!(last_keep_alive, Some(time_over_threshold));
assert_eq!(last_wal_interaction, Some(time_over_threshold));
assert_eq!(threshold, state.lagging_wal_timeout);
}
unexpected => panic!("Unexpected reason: {unexpected:?}"),
@@ -1193,34 +1133,20 @@ mod tests {
}
#[tokio::test]
async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("timeout_wal_over_threshhold_current_candidate")?;
async fn timeout_connection_over_threshhold_current_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("timeout_connection_over_threshhold_current_candidate")?;
let mut state = dummy_state(&harness);
let current_lsn = Lsn(100_000).align();
let new_lsn = Lsn(100_100).align();
let now = Utc::now().naive_utc();
let lagging_wal_timeout = chrono::Duration::from_std(state.lagging_wal_timeout)?;
let time_over_threshold =
Utc::now().naive_utc() - lagging_wal_timeout - lagging_wal_timeout;
let connection_status = WalConnectionStatus {
is_connected: true,
has_received_wal: true,
latest_connection_update: now,
latest_wal_update: time_over_threshold,
commit_lsn: Some(current_lsn),
streaming_lsn: Some(current_lsn),
};
state.wal_connection = Some(WalConnection {
sk_id: NodeId(1),
status: connection_status,
latest_connection_update: time_over_threshold,
connection_task: TaskHandle::spawn(move |_, _| async move { Ok(()) }),
discovered_new_wal: Some(NewCommittedWAL {
discovered_at: time_over_threshold,
lsn: new_lsn,
}),
});
state.wal_stream_candidates = HashMap::from([(
NodeId(0),
@@ -1228,7 +1154,7 @@ mod tests {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(new_lsn),
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
@@ -1246,16 +1172,10 @@ mod tests {
assert_eq!(over_threshcurrent_candidate.safekeeper_id, NodeId(0));
match over_threshcurrent_candidate.reason {
ReconnectReason::NoWalTimeout {
current_lsn,
current_commit_lsn,
candidate_commit_lsn,
last_wal_interaction,
threshold,
..
} => {
assert_eq!(current_lsn, current_lsn);
assert_eq!(current_commit_lsn, current_lsn);
assert_eq!(candidate_commit_lsn, new_lsn);
assert_eq!(last_wal_interaction, Some(time_over_threshold));
assert_eq!(threshold, state.lagging_wal_timeout);
}
@@ -1282,7 +1202,7 @@ mod tests {
.expect("Failed to create an empty timeline for dummy wal connection manager"),
wal_connect_timeout: Duration::from_secs(1),
lagging_wal_timeout: Duration::from_secs(1),
max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(),
max_lsn_wal_lag: NonZeroU64::new(1).unwrap(),
wal_connection: None,
wal_stream_candidates: HashMap::new(),
wal_connection_attempts: HashMap::new(),

View File

@@ -8,7 +8,6 @@ use std::{
use anyhow::{bail, ensure, Context};
use bytes::BytesMut;
use chrono::{NaiveDateTime, Utc};
use fail::fail_point;
use futures::StreamExt;
use postgres::{SimpleQueryMessage, SimpleQueryRow};
@@ -30,29 +29,12 @@ use crate::{
use postgres_ffi::waldecoder::WalStreamDecoder;
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback, zid::ZTenantTimelineId};
/// Status of the connection.
#[derive(Debug, Clone)]
pub struct WalConnectionStatus {
/// If we were able to initiate a postgres connection, this means that the safekeeper process is at least running.
pub is_connected: bool,
/// Defines a healthy connection as one on which we have received at least some WAL bytes.
pub has_received_wal: bool,
/// Connection establishment time or the timestamp of a latest connection message received.
pub latest_connection_update: NaiveDateTime,
/// Time of the latest WAL message received.
pub latest_wal_update: NaiveDateTime,
/// The latest WAL update contained WAL up to this LSN. The next WAL message will start from that LSN.
pub streaming_lsn: Option<Lsn>,
/// Latest commit_lsn received from the safekeeper. Can be zero if no message has been received yet.
pub commit_lsn: Option<Lsn>,
}
/// Open a connection to the given safekeeper and receive WAL, sending back progress
/// messages as we go.
pub async fn handle_walreceiver_connection(
id: ZTenantTimelineId,
wal_source_connstr: &str,
events_sender: &watch::Sender<TaskEvent<WalConnectionStatus>>,
events_sender: &watch::Sender<TaskEvent<ReplicationFeedback>>,
mut cancellation: watch::Receiver<()>,
connect_timeout: Duration,
) -> anyhow::Result<()> {
@@ -67,26 +49,12 @@ pub async fn handle_walreceiver_connection(
.await
.context("Timed out while waiting for walreceiver connection to open")?
.context("Failed to open walreceiver conection")?;
info!("connected!");
let mut connection_status = WalConnectionStatus {
is_connected: true,
has_received_wal: false,
latest_connection_update: Utc::now().naive_utc(),
latest_wal_update: Utc::now().naive_utc(),
streaming_lsn: None,
commit_lsn: None,
};
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
return Ok(());
}
// The connection object performs the actual communication with the database,
// so spawn it off to run on its own.
let mut connection_cancellation = cancellation.clone();
tokio::spawn(
async move {
info!("connected!");
select! {
connection_result = connection => match connection_result{
Ok(()) => info!("Walreceiver db connection closed"),
@@ -116,14 +84,6 @@ pub async fn handle_walreceiver_connection(
let identify = identify_system(&mut replication_client).await?;
info!("{identify:?}");
connection_status.latest_connection_update = Utc::now().naive_utc();
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
return Ok(());
}
// NB: this is a flush_lsn, not a commit_lsn.
let end_of_wal = Lsn::from(u64::from(identify.xlogpos));
let mut caught_up = false;
let ZTenantTimelineId {
@@ -158,7 +118,7 @@ pub async fn handle_walreceiver_connection(
// There might be some padding after the last full record, skip it.
startpoint += startpoint.calc_padding(8u32);
info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, safekeeper is at {end_of_wal}...");
info!("last_record_lsn {last_rec_lsn} starting replication from {startpoint}, server is at {end_of_wal}...");
let query = format!("START_REPLICATION PHYSICAL {startpoint}");
@@ -180,33 +140,6 @@ pub async fn handle_walreceiver_connection(
}
} {
let replication_message = replication_message?;
let now = Utc::now().naive_utc();
// Update the connection status before processing the message. If the message processing
// fails (e.g. in walingest), we still want to know the latest LSNs from the safekeeper.
match &replication_message {
ReplicationMessage::XLogData(xlog_data) => {
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(xlog_data.wal_end()));
connection_status.streaming_lsn = Some(Lsn::from(
xlog_data.wal_start() + xlog_data.data().len() as u64,
));
if !xlog_data.data().is_empty() {
connection_status.latest_wal_update = now;
connection_status.has_received_wal = true;
}
}
ReplicationMessage::PrimaryKeepAlive(keepalive) => {
connection_status.latest_connection_update = now;
connection_status.commit_lsn = Some(Lsn::from(keepalive.wal_end()));
}
&_ => {}
};
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
warn!("Wal connection event listener dropped, aborting the connection: {e}");
return Ok(());
}
let status_update = match replication_message {
ReplicationMessage::XLogData(xlog_data) => {
// Pass the WAL data to the decoder, and see if we can decode
@@ -324,6 +257,10 @@ pub async fn handle_walreceiver_connection(
.as_mut()
.zenith_status_update(data.len() as u64, &data)
.await?;
if let Err(e) = events_sender.send(TaskEvent::NewEvent(zenith_status_update)) {
warn!("Wal connection event listener dropped, aborting the connection: {e}");
return Ok(());
}
}
}
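For context, the events_sender used in this file is a tokio watch channel: the connection task publishes its most recent status, and the connection manager only ever needs the latest value rather than a full history. A minimal sketch of that pattern with invented types (the real code sends TaskEvent values):

use tokio::sync::watch;

#[derive(Clone, Debug, Default)]
struct Status {
    has_received_wal: bool,
    streaming_lsn: u64,
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = watch::channel(Status::default());

    // Connection task: publish a new status after handling a message.
    tokio::spawn(async move {
        let _ = tx.send(Status { has_received_wal: true, streaming_lsn: 42 });
    });

    // Manager side: wake up whenever a newer status is available, then read it.
    rx.changed().await.unwrap();
    let latest = rx.borrow().clone();
    println!("latest status: {latest:?}");
}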

poetry.lock (generated): file diff suppressed because one or more lines are too long

View File

@@ -7,7 +7,6 @@ edition = "2021"
anyhow = "1.0"
async-trait = "0.1"
base64 = "0.13.0"
bstr = "0.2.17"
bytes = { version = "1.0.1", features = ['serde'] }
clap = "3.0"
futures = "0.3.13"

View File

@@ -12,7 +12,7 @@ use password_hack::PasswordHackPayload;
mod flow;
pub use flow::*;
use crate::error::UserFacingError;
use crate::{error::UserFacingError, waiters};
use std::io;
use thiserror::Error;
@@ -22,54 +22,51 @@ pub type Result<T> = std::result::Result<T, AuthError>;
/// Common authentication error.
#[derive(Debug, Error)]
pub enum AuthErrorImpl {
// This will be dropped in the future.
/// Authentication error reported by the console.
#[error(transparent)]
Legacy(#[from] backend::LegacyAuthError),
Console(#[from] backend::AuthError),
#[error(transparent)]
Link(#[from] backend::LinkAuthError),
GetAuthInfo(#[from] backend::console::ConsoleAuthError),
#[error(transparent)]
GetAuthInfo(#[from] backend::GetAuthInfoError),
#[error(transparent)]
WakeCompute(#[from] backend::WakeComputeError),
/// SASL protocol errors (includes [SCRAM](crate::scram)).
#[error(transparent)]
Sasl(#[from] crate::sasl::Error),
#[error("Unsupported authentication method: {0}")]
BadAuthMethod(Box<str>),
#[error("Malformed password message: {0}")]
MalformedPassword(&'static str),
#[error(
"Project name is not specified. \
Either please upgrade the postgres client library (libpq) for SNI support \
or pass the project name as a parameter: '&options=project%3D<project-name>'. \
See more at https://neon.tech/sni"
)]
MissingProjectName,
/// Errors produced by e.g. [`crate::stream::PqStream`].
/// Errors produced by [`crate::stream::PqStream`].
#[error(transparent)]
Io(#[from] io::Error),
}
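The MissingProjectName message above points clients without SNI support at the options startup parameter. A hypothetical connection string illustrating that workaround (host, user, password and project name are made up):

fn main() {
    // Pass the project through "options" when the client cannot send SNI.
    let conn_str =
        "postgresql://alice:secret@proxy.example.com:5432/main?options=project%3Dmy-project";
    println!("connect with: {conn_str}");
}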
impl AuthErrorImpl {
pub fn auth_failed(msg: impl Into<String>) -> Self {
Self::Console(backend::AuthError::auth_failed(msg))
}
}
impl From<waiters::RegisterError> for AuthErrorImpl {
fn from(e: waiters::RegisterError) -> Self {
Self::Console(backend::AuthError::from(e))
}
}
impl From<waiters::WaitError> for AuthErrorImpl {
fn from(e: waiters::WaitError) -> Self {
Self::Console(backend::AuthError::from(e))
}
}
#[derive(Debug, Error)]
#[error(transparent)]
pub struct AuthError(Box<AuthErrorImpl>);
impl AuthError {
pub fn bad_auth_method(name: impl Into<Box<str>>) -> Self {
AuthErrorImpl::BadAuthMethod(name.into()).into()
}
}
impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
fn from(e: E) -> Self {
impl<T> From<T> for AuthError
where
AuthErrorImpl: From<T>,
{
fn from(e: T) -> Self {
Self(Box::new(e.into()))
}
}
@@ -78,14 +75,10 @@ impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
use AuthErrorImpl::*;
match self.0.as_ref() {
Legacy(e) => e.to_string_client(),
Link(e) => e.to_string_client(),
Console(e) => e.to_string_client(),
GetAuthInfo(e) => e.to_string_client(),
WakeCompute(e) => e.to_string_client(),
Sasl(e) => e.to_string_client(),
BadAuthMethod(_) => self.to_string(),
MalformedPassword(_) => self.to_string(),
MissingProjectName => self.to_string(),
_ => "Internal error".to_string(),
}
}
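This to_string_client split is a small error-sanitization pattern: the full error goes to logs through Display, while clients only see messages that are safe to expose. A self-contained sketch with assumed trait and type names (the real trait lives in crate::error):

use thiserror::Error;

trait UserFacingError: std::fmt::Display {
    /// Message that is safe to show to the client.
    fn to_string_client(&self) -> String {
        "Internal error".to_string()
    }
}

#[derive(Debug, Error)]
enum ExampleAuthError {
    #[error("Authentication failed: {0}")]
    AuthFailed(String),
    #[error("backend request failed: {0}")]
    Transport(String),
}

impl UserFacingError for ExampleAuthError {
    fn to_string_client(&self) -> String {
        match self {
            // The user can act on this, so pass the details through.
            Self::AuthFailed(_) => self.to_string(),
            // Anything else could leak internals; keep it generic.
            _ => "Internal error".to_string(),
        }
    }
}

fn main() {
    let err = ExampleAuthError::Transport("connection refused to 10.0.0.1".into());
    println!("log: {err}");
    println!("client: {}", err.to_string_client());
}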

View File

@@ -1,13 +1,10 @@
mod link;
mod postgres;
mod link;
pub use link::LinkAuthError;
mod console;
pub use console::{GetAuthInfoError, WakeComputeError};
pub mod console;
mod legacy_console;
pub use legacy_console::LegacyAuthError;
pub use legacy_console::{AuthError, AuthErrorImpl};
use crate::{
auth::{self, AuthFlow, ClientCredentials},
@@ -86,7 +83,7 @@ impl From<DatabaseInfo> for tokio_postgres::Config {
/// * However, when we substitute `T` with [`ClientCredentials`],
/// this helps us provide the credentials only to those auth
/// backends which require them for the authentication process.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum BackendType<T> {
/// Legacy Cloud API (V1) + link auth.
LegacyConsole(T),

View File

@@ -13,11 +13,21 @@ use std::future::Future;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};
const REQUEST_FAILED: &str = "Console request failed";
pub type Result<T> = std::result::Result<T, ConsoleAuthError>;
#[derive(Debug, Error)]
pub enum TransportError {
#[error("Console responded with a malformed JSON: {0}")]
pub enum ConsoleAuthError {
#[error(transparent)]
BadProjectName(#[from] auth::credentials::ClientCredsParseError),
// We shouldn't include the actual secret here.
#[error("Bad authentication secret")]
BadSecret,
#[error("Console responded with a malformed compute address: '{0}'")]
BadComputeAddress(String),
#[error("Console responded with a malformed JSON: '{0}'")]
BadResponse(#[from] serde_json::Error),
/// HTTP status (other than 200) returned by the console.
@@ -28,72 +38,19 @@ pub enum TransportError {
Io(#[from] std::io::Error),
}
impl UserFacingError for TransportError {
impl UserFacingError for ConsoleAuthError {
fn to_string_client(&self) -> String {
use TransportError::*;
use ConsoleAuthError::*;
match self {
HttpStatus(_) => self.to_string(),
_ => REQUEST_FAILED.to_owned(),
BadProjectName(e) => e.to_string_client(),
_ => "Internal error".to_string(),
}
}
}
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
impl From<reqwest::Error> for TransportError {
fn from(e: reqwest::Error) -> Self {
io_error(e).into()
}
}
#[derive(Debug, Error)]
pub enum GetAuthInfoError {
// We shouldn't include the actual secret here.
#[error("Console responded with a malformed auth secret")]
BadSecret,
#[error(transparent)]
Transport(TransportError),
}
impl UserFacingError for GetAuthInfoError {
fn to_string_client(&self) -> String {
use GetAuthInfoError::*;
match self {
BadSecret => REQUEST_FAILED.to_owned(),
Transport(e) => e.to_string_client(),
}
}
}
impl<E: Into<TransportError>> From<E> for GetAuthInfoError {
fn from(e: E) -> Self {
Self::Transport(e.into())
}
}
#[derive(Debug, Error)]
pub enum WakeComputeError {
// We shouldn't show users the address even if it's broken.
#[error("Console responded with a malformed compute address: {0}")]
BadComputeAddress(String),
#[error(transparent)]
Transport(TransportError),
}
impl UserFacingError for WakeComputeError {
fn to_string_client(&self) -> String {
use WakeComputeError::*;
match self {
BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
Transport(e) => e.to_string_client(),
}
}
}
impl<E: Into<TransportError>> From<E> for WakeComputeError {
fn from(e: E) -> Self {
Self::Transport(e.into())
impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
fn from(e: &auth::credentials::ClientCredsParseError) -> Self {
ConsoleAuthError::BadProjectName(e.clone())
}
}
@@ -138,7 +95,7 @@ impl<'a> Api<'a> {
handle_user(client, &self, Self::get_auth_info, Self::wake_compute).await
}
async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
async fn get_auth_info(&self) -> Result<AuthInfo> {
let mut url = self.endpoint.clone();
url.path_segments_mut().push("proxy_get_role_secret");
url.query_pairs_mut()
@@ -148,20 +105,21 @@ impl<'a> Api<'a> {
// TODO: use a proper logger
println!("cplane request: {url}");
let resp = reqwest::get(url.into_inner()).await?;
let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?;
if !resp.status().is_success() {
return Err(TransportError::HttpStatus(resp.status()).into());
return Err(ConsoleAuthError::HttpStatus(resp.status()));
}
let response: GetRoleSecretResponse = serde_json::from_str(&resp.text().await?)?;
let response: GetRoleSecretResponse =
serde_json::from_str(&resp.text().await.map_err(io_error)?)?;
scram::ServerSecret::parse(&response.role_secret)
scram::ServerSecret::parse(response.role_secret.as_str())
.map(AuthInfo::Scram)
.ok_or(GetAuthInfoError::BadSecret)
.ok_or(ConsoleAuthError::BadSecret)
}
/// Wake up the compute node and return the corresponding connection info.
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg> {
let mut url = self.endpoint.clone();
url.path_segments_mut().push("proxy_wake_compute");
url.query_pairs_mut()
@@ -170,16 +128,17 @@ impl<'a> Api<'a> {
// TODO: use a proper logger
println!("cplane request: {url}");
let resp = reqwest::get(url.into_inner()).await?;
let resp = reqwest::get(url.into_inner()).await.map_err(io_error)?;
if !resp.status().is_success() {
return Err(TransportError::HttpStatus(resp.status()).into());
return Err(ConsoleAuthError::HttpStatus(resp.status()));
}
let response: GetWakeComputeResponse = serde_json::from_str(&resp.text().await?)?;
let response: GetWakeComputeResponse =
serde_json::from_str(&resp.text().await.map_err(io_error)?)?;
// Unfortunately, ownership won't let us use `Option::ok_or` here.
let (host, port) = match parse_host_port(&response.address) {
None => return Err(WakeComputeError::BadComputeAddress(response.address)),
None => return Err(ConsoleAuthError::BadComputeAddress(response.address)),
Some(x) => x,
};
@@ -203,8 +162,8 @@ pub(super) async fn handle_user<'a, Endpoint, GetAuthInfo, WakeCompute>(
wake_compute: impl FnOnce(&'a Endpoint) -> WakeCompute,
) -> auth::Result<compute::NodeInfo>
where
GetAuthInfo: Future<Output = Result<AuthInfo, GetAuthInfoError>>,
WakeCompute: Future<Output = Result<ComputeConnCfg, WakeComputeError>>,
GetAuthInfo: Future<Output = Result<AuthInfo>>,
WakeCompute: Future<Output = Result<ComputeConnCfg>>,
{
let auth_info = get_auth_info(endpoint).await?;
@@ -212,7 +171,7 @@ where
let scram_keys = match auth_info {
AuthInfo::Md5(_) => {
// TODO: decide if we should support MD5 in api v2
return Err(auth::AuthError::bad_auth_method("MD5"));
return Err(auth::AuthErrorImpl::auth_failed("MD5 is not supported").into());
}
AuthInfo::Scram(secret) => {
let scram = auth::Scram(&secret);

View File

@@ -14,7 +14,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::BeMessage as Be;
#[derive(Debug, Error)]
pub enum LegacyAuthError {
pub enum AuthErrorImpl {
/// Authentication error reported by the console.
#[error("Authentication failed: {0}")]
AuthFailed(String),
@@ -24,7 +24,7 @@ pub enum LegacyAuthError {
HttpStatus(reqwest::StatusCode),
#[error("Console responded with a malformed JSON: {0}")]
BadResponse(#[from] serde_json::Error),
MalformedResponse(#[from] serde_json::Error),
#[error(transparent)]
Transport(#[from] reqwest::Error),
@@ -36,10 +36,30 @@ pub enum LegacyAuthError {
WaiterWait(#[from] waiters::WaitError),
}
impl UserFacingError for LegacyAuthError {
#[derive(Debug, Error)]
#[error(transparent)]
pub struct AuthError(Box<AuthErrorImpl>);
impl AuthError {
/// Smart constructor for authentication error reported by `mgmt`.
pub fn auth_failed(msg: impl Into<String>) -> Self {
Self(Box::new(AuthErrorImpl::AuthFailed(msg.into())))
}
}
impl<T> From<T> for AuthError
where
AuthErrorImpl: From<T>,
{
fn from(e: T) -> Self {
Self(Box::new(e.into()))
}
}
impl UserFacingError for AuthError {
fn to_string_client(&self) -> String {
use LegacyAuthError::*;
match self {
use AuthErrorImpl::*;
match self.0.as_ref() {
AuthFailed(_) | HttpStatus(_) => self.to_string(),
_ => "Internal error".to_string(),
}
@@ -68,7 +88,7 @@ async fn authenticate_proxy_client(
md5_response: &str,
salt: &[u8; 4],
psql_session_id: &str,
) -> Result<DatabaseInfo, LegacyAuthError> {
) -> Result<DatabaseInfo, AuthError> {
let mut url = auth_endpoint.clone();
url.query_pairs_mut()
.append_pair("login", &creds.user)
@@ -82,17 +102,17 @@ async fn authenticate_proxy_client(
// TODO: leverage `reqwest::Client` to reuse connections
let resp = reqwest::get(url).await?;
if !resp.status().is_success() {
return Err(LegacyAuthError::HttpStatus(resp.status()));
return Err(AuthErrorImpl::HttpStatus(resp.status()).into());
}
let auth_info = serde_json::from_str(resp.text().await?.as_str())?;
let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text().await?.as_str())?;
println!("got auth info: {:?}", auth_info);
use ProxyAuthResponse::*;
let db_info = match auth_info {
Ready { conn_info } => conn_info,
Error { error } => return Err(LegacyAuthError::AuthFailed(error)),
NotReady { .. } => waiter.await?.map_err(LegacyAuthError::AuthFailed)?,
Error { error } => return Err(AuthErrorImpl::AuthFailed(error).into()),
NotReady { .. } => waiter.await?.map_err(AuthErrorImpl::AuthFailed)?,
};
Ok(db_info)
@@ -104,7 +124,7 @@ async fn handle_existing_user(
auth_endpoint: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
creds: &ClientCredentials,
) -> auth::Result<compute::NodeInfo> {
) -> Result<compute::NodeInfo, auth::AuthError> {
let psql_session_id = super::link::new_psql_session_id();
let md5_salt = rand::random();

View File

@@ -1,34 +1,7 @@
use crate::{auth, compute, error::UserFacingError, stream::PqStream, waiters};
use thiserror::Error;
use crate::{auth, compute, stream::PqStream};
use tokio::io::{AsyncRead, AsyncWrite};
use utils::pq_proto::{BeMessage as Be, BeParameterStatusMessage};
#[derive(Debug, Error)]
pub enum LinkAuthError {
/// Authentication error reported by the console.
#[error("Authentication failed: {0}")]
AuthFailed(String),
#[error(transparent)]
WaiterRegister(#[from] waiters::RegisterError),
#[error(transparent)]
WaiterWait(#[from] waiters::WaitError),
#[error(transparent)]
Io(#[from] std::io::Error),
}
impl UserFacingError for LinkAuthError {
fn to_string_client(&self) -> String {
use LinkAuthError::*;
match self {
AuthFailed(_) => self.to_string(),
_ => "Internal error".to_string(),
}
}
}
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
format!(
concat![
@@ -61,7 +34,7 @@ pub async fn handle_user(
.await?;
// Wait for web console response (see `mgmt`)
waiter.await?.map_err(LinkAuthError::AuthFailed)
waiter.await?.map_err(auth::AuthErrorImpl::auth_failed)
})
.await?;

View File

@@ -3,7 +3,7 @@
use crate::{
auth::{
self,
backend::console::{self, AuthInfo, GetAuthInfoError, TransportError, WakeComputeError},
backend::console::{self, AuthInfo, Result},
ClientCredentials,
},
compute::{self, ComputeConnCfg},
@@ -20,13 +20,6 @@ pub(super) struct Api<'a> {
creds: &'a ClientCredentials,
}
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
impl From<tokio_postgres::Error> for TransportError {
fn from(e: tokio_postgres::Error) -> Self {
io_error(e).into()
}
}
impl<'a> Api<'a> {
/// Construct an API object containing the auth parameters.
pub(super) fn new(endpoint: &'a ApiUrl, creds: &'a ClientCredentials) -> Self {
@@ -43,16 +36,21 @@ impl<'a> Api<'a> {
}
/// This implementation fetches the auth info from a local postgres instance.
async fn get_auth_info(&self) -> Result<AuthInfo, GetAuthInfoError> {
async fn get_auth_info(&self) -> Result<AuthInfo> {
// Perhaps we could persist this connection, but then we'd have to
// write more code for reopening it if it got closed, which doesn't
// seem worth it.
let (client, connection) =
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?;
tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls)
.await
.map_err(io_error)?;
tokio::spawn(connection);
let query = "select rolpassword from pg_catalog.pg_authid where rolname = $1";
let rows = client.query(query, &[&self.creds.user]).await?;
let rows = client
.query(query, &[&self.creds.user])
.await
.map_err(io_error)?;
match &rows[..] {
// We can't get a secret if there's no such user.
@@ -76,13 +74,13 @@ impl<'a> Api<'a> {
}))
})
// Putting the secret into this message is a security hazard!
.ok_or(GetAuthInfoError::BadSecret)
.ok_or(console::ConsoleAuthError::BadSecret)
}
}
}
/// We don't need to wake anything locally, so we just return the connection info.
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg, WakeComputeError> {
pub(super) async fn wake_compute(&self) -> Result<ComputeConnCfg> {
let mut config = ComputeConnCfg::new();
config
.host(self.endpoint.host_str().unwrap_or("localhost"))

View File

@@ -75,12 +75,13 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
.strip_suffix(&[0])
.ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?;
let payload = PasswordHackPayload::parse(password)
// If we ended up here and the payload is malformed, it means that
// the user neither enabled SNI nor resorted to any other method
// for passing the project name we rely on. We should show them
// the most helpful error message and point to the documentation.
.ok_or(AuthErrorImpl::MissingProjectName)?;
// The so-called "password" should contain a base64-encoded json.
// We will use it later to route the client to their project.
let bytes = base64::decode(password)
.map_err(|_| AuthErrorImpl::MalformedPassword("bad encoding"))?;
let payload = serde_json::from_slice(&bytes)
.map_err(|_| AuthErrorImpl::MalformedPassword("invalid payload"))?;
Ok(payload)
}
@@ -97,7 +98,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
// Currently, the only supported SASL method is SCRAM.
if !scram::METHODS.contains(&sasl.method) {
return Err(super::AuthError::bad_auth_method(sasl.method));
return Err(AuthErrorImpl::auth_failed("method not supported").into());
}
let secret = self.state.0;

View File

@@ -1,46 +1,102 @@
//! Payload for ad hoc authentication method for clients that don't support SNI.
//! See the `impl` for [`super::backend::BackendType<ClientCredentials>`].
//! Read more: <https://github.com/neondatabase/cloud/issues/1620#issuecomment-1165332290>.
//! UPDATE (Mon Aug 8 13:20:34 UTC 2022): the payload format has been simplified.
use bstr::ByteSlice;
use serde::{de, Deserialize, Deserializer};
use std::fmt;
pub struct PasswordHackPayload {
pub project: String,
pub password: Vec<u8>,
#[derive(Deserialize)]
#[serde(untagged)]
pub enum Password {
/// A regular string for utf-8 encoded passwords.
Simple { password: String },
/// Password is base64-encoded because it may contain arbitrary byte sequences.
Encoded {
#[serde(rename = "password_", deserialize_with = "deserialize_base64")]
password: Vec<u8>,
},
}
impl PasswordHackPayload {
pub fn parse(bytes: &[u8]) -> Option<Self> {
// The format is `project=<utf-8>;<password-bytes>`.
let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";");
let project = iter.next()?.to_str().ok()?.to_owned();
let password = iter.next()?.to_owned();
Some(Self { project, password })
impl AsRef<[u8]> for Password {
fn as_ref(&self) -> &[u8] {
match self {
Password::Simple { password } => password.as_ref(),
Password::Encoded { password } => password.as_ref(),
}
}
}
#[derive(Deserialize)]
pub struct PasswordHackPayload {
pub project: String,
#[serde(flatten)]
pub password: Password,
}
fn deserialize_base64<'a, D: Deserializer<'a>>(des: D) -> Result<Vec<u8>, D::Error> {
// It's very tempting to replace this with
//
// ```
// let base64: &str = Deserialize::deserialize(des)?;
// base64::decode(base64).map_err(serde::de::Error::custom)
// ```
//
// Unfortunately, we can't always deserialize into `&str`, so we'd
// have to use an allocating `String` instead. Thus, visitor is better.
struct Visitor;
impl<'de> de::Visitor<'de> for Visitor {
type Value = Vec<u8>;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("a string")
}
fn visit_str<E: de::Error>(self, v: &str) -> Result<Self::Value, E> {
base64::decode(v).map_err(de::Error::custom)
}
}
des.deserialize_str(Visitor)
}
#[cfg(test)]
mod tests {
use super::*;
use rstest::rstest;
use serde_json::json;
#[test]
fn parse_password_hack_payload() {
let bytes = b"";
assert!(PasswordHackPayload::parse(bytes).is_none());
fn parse_password() -> anyhow::Result<()> {
let password: Password = serde_json::from_value(json!({
"password": "foo",
}))?;
assert_eq!(password.as_ref(), "foo".as_bytes());
let bytes = b"project=";
assert!(PasswordHackPayload::parse(bytes).is_none());
let password: Password = serde_json::from_value(json!({
"password_": base64::encode("foo"),
}))?;
assert_eq!(password.as_ref(), "foo".as_bytes());
let bytes = b"project=;";
let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
assert_eq!(payload.project, "");
assert_eq!(payload.password, b"");
Ok(())
}
let bytes = b"project=foobar;pass;word";
let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
assert_eq!(payload.project, "foobar");
assert_eq!(payload.password, b"pass;word");
#[rstest]
#[case("password", str::to_owned)]
#[case("password_", base64::encode)]
fn parse(#[case] key: &str, #[case] encode: fn(&'static str) -> String) -> anyhow::Result<()> {
let (password, project) = ("password", "pie-in-the-sky");
let payload = json!({
"project": project,
key: encode(password),
});
let payload: PasswordHackPayload = serde_json::from_value(payload)?;
assert_eq!(payload.password.as_ref(), password.as_bytes());
assert_eq!(payload.project, project);
Ok(())
}
}
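For reference, a minimal sketch (Python, illustrative only; the helper name is ours) of how a client without SNI support could build the "magic" password accepted by the parser above. As the proxy test further down shows, the whole JSON document is base64-encoded and sent as the regular libpq password:

import base64
import json

def make_magic_password(project: str, password: str) -> str:
    # Plain UTF-8 passwords go under "password"; arbitrary byte sequences
    # would instead go base64-encoded under "password_".
    payload = {"project": project, "password": password}
    return base64.b64encode(json.dumps(payload).encode()).decode()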

View File

@@ -65,17 +65,8 @@ impl NodeInfo {
// require for our business.
let mut connection_error = None;
let ports = self.config.get_ports();
let hosts = self.config.get_hosts();
// the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array
if ports.len() > 1 && ports.len() != hosts.len() {
return Err(io::Error::new(
io::ErrorKind::Other,
format!("couldn't connect: bad compute config, ports and hosts entries' count does not match: {:?}", self.config),
));
}
for (i, host) in hosts.iter().enumerate() {
let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432);
for (i, host) in self.config.get_hosts().iter().enumerate() {
let port = ports.get(i).or_else(|| ports.get(0)).unwrap_or(&5432);
let host = match host {
Host::Tcp(host) => host.as_str(),
Host::Unix(_) => continue, // unix sockets are not welcome here
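The fallback above boils down to: use the per-host port when one is given, otherwise the single shared port, otherwise 5432. A rough Python rendering of that rule (illustrative only, not part of the codebase):

def pick_port(ports, i, default=5432):
    # Mirrors `ports.get(i).or_else(|| ports.first()).unwrap_or(&5432)`.
    if i < len(ports):
        return ports[i]
    return ports[0] if ports else default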

View File

@@ -14,7 +14,7 @@ pub const SCRAM_RAW_NONCE_LEN: usize = 18;
fn validate_sasl_extensions<'a>(parts: impl Iterator<Item = &'a str>) -> Option<()> {
for mut chars in parts.map(|s| s.chars()) {
let attr = chars.next()?;
if !('a'..='z').contains(&attr) && !('A'..='Z').contains(&attr) {
if !('a'..'z').contains(&attr) && !('A'..'Z').contains(&attr) {
return None;
}
let eq = chars.next()?;

View File

@@ -26,7 +26,6 @@ pytest-lazy-fixture = "^0.6.3"
prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0"
Werkzeug = "2.1.2"
pytest-order = "^1.0.1"
[tool.poetry.dev-dependencies]
yapf = "==0.31.0"

View File

@@ -40,7 +40,7 @@ struct SafeKeeperStateV1 {
wal_start_lsn: Lsn,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ServerInfoV2 {
/// Postgres server version
pub pg_version: u32,
@@ -70,7 +70,7 @@ pub struct SafeKeeperStateV2 {
pub wal_start_lsn: Lsn,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ServerInfoV3 {
/// Postgres server version
pub pg_version: u32,

View File

@@ -127,7 +127,7 @@ impl AcceptorState {
/// Information about Postgres. Safekeeper gets it once and then verifies
/// all further connections from computes match.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ServerInfo {
/// Postgres server version
pub pg_version: u32,

View File

@@ -36,7 +36,7 @@ const NEON_STATUS_UPDATE_TAG_BYTE: u8 = b'z';
type FullTransactionId = u64;
/// Hot standby feedback received from replica
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct HotStandbyFeedback {
pub ts: TimestampTz,
pub xmin: FullTransactionId,

View File

@@ -332,7 +332,7 @@ impl Storage for PhysicalStorage {
self.write_lsn = if state.commit_lsn == Lsn(0) {
Lsn(0)
} else {
find_end_of_wal(&self.timeline_dir, wal_seg_size, state.commit_lsn)?
Lsn(find_end_of_wal(&self.timeline_dir, wal_seg_size, true, state.commit_lsn)?.0)
};
self.write_record_lsn = self.write_lsn;

View File

@@ -18,10 +18,6 @@ exclude = ^vendor/
# some tests don't typecheck when this flag is set
check_untyped_defs = false
# Help mypy find imports when running against list of individual files.
# Without this line it would behave differently when executed on the entire project.
mypy_path = $MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner
disallow_incomplete_defs = false
disallow_untyped_calls = false
disallow_untyped_decorators = false

View File

@@ -120,7 +120,10 @@ def test_import_from_pageserver_small(pg_bin: PgBin, neon_env_builder: NeonEnvBu
@pytest.mark.timeout(1800)
@pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build")
# TODO: `test_import_from_pageserver_multisegment` is temporarily disabled; re-enable
# it once the cause of the failure is found.
# @pytest.mark.skipif(os.environ.get('BUILD_TYPE') == "debug", reason="only run with release build")
@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/2255")
def test_import_from_pageserver_multisegment(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 1
neon_env_builder.enable_local_fs_remote_storage()

View File

@@ -2,6 +2,16 @@ from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.log_helper import log
# Test that the pageserver fixture is implemented correctly, allowing quick restarts.
# This is a regression test, see https://github.com/neondatabase/neon/issues/2247
def test_fixture_restart(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
for i in range(3):
env.pageserver.stop()
env.pageserver.start()
# Test restarting page server, while safekeeper and compute node keep
# running.
def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):

View File

@@ -1,5 +1,6 @@
import pytest
import psycopg2
import json
import base64
def test_proxy_select_1(static_proxy):
@@ -12,14 +13,22 @@ def test_password_hack(static_proxy):
static_proxy.safe_psql(f"create role {user} with login password '{password}'",
options='project=irrelevant')
# Note the format of `magic`!
magic = f"project=irrelevant;{password}"
def encode(s: str) -> str:
return base64.b64encode(s.encode('utf-8')).decode('utf-8')
magic = encode(json.dumps({
'project': 'irrelevant',
'password': password,
}))
static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic)
# Must also check that invalid magic won't be accepted.
with pytest.raises(psycopg2.errors.OperationalError):
magic = "broken"
static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic)
magic = encode(json.dumps({
'project': 'irrelevant',
'password_': encode(password),
}))
static_proxy.safe_psql('select 1', sslsni=0, user=user, password=magic)
# Pass extra options to the server.

View File

@@ -1,17 +0,0 @@
"""Tests for the code in test fixtures"""
from fixtures.neon_fixtures import NeonEnvBuilder
# Test that pageserver and safekeeper can restart quickly.
# This is a regression test, see https://github.com/neondatabase/neon/issues/2247
def test_fixture_restart(neon_env_builder: NeonEnvBuilder):
env = neon_env_builder.init_start()
for i in range(3):
env.pageserver.stop()
env.pageserver.start()
for i in range(3):
env.safekeepers[0].stop()
env.safekeepers[0].start()

View File

@@ -1090,9 +1090,11 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
# Remove initial tenant fully (two branches are active)
response = sk_http.tenant_delete_force(tenant_id)
assert response[timeline_id_3] == {
"dir_existed": True,
"was_active": True,
assert response == {
timeline_id_3: {
"dir_existed": True,
"was_active": True,
}
}
assert not (sk_data_dir / tenant_id).exists()
assert (sk_data_dir / tenant_id_other / timeline_id_other).is_dir()

View File

@@ -520,68 +520,3 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder):
pg = env.postgres.create_start('test_safekeepers_race_conditions')
asyncio.run(run_race_conditions(env, pg))
# Check that pageserver can select safekeeper with largest commit_lsn
# and switch if LSN is not updated for some time (NoWalTimeout).
async def run_wal_lagging(env: NeonEnv, pg: Postgres):
def safekeepers_guc(env: NeonEnv, active_sk: List[bool]) -> str:
# use ports 10, 11 and 12 to simulate unavailable safekeepers
return ','.join([
f'localhost:{sk.port.pg if active else 10 + i}'
for i, (sk, active) in enumerate(zip(env.safekeepers, active_sk))
])
conn = await pg.connect_async()
await conn.execute('CREATE TABLE t(key int primary key, value text)')
await conn.close()
pg.stop()
n_iterations = 20
n_txes = 10000
expected_sum = 0
i = 1
quorum = len(env.safekeepers) // 2 + 1
for it in range(n_iterations):
active_sk = list(map(lambda _: random.random() >= 0.5, env.safekeepers))
active_count = sum(active_sk)
if active_count < quorum:
it -= 1
continue
pg.adjust_for_safekeepers(safekeepers_guc(env, active_sk))
log.info(f'Iteration {it}: {active_sk}')
pg.start()
conn = await pg.connect_async()
for _ in range(n_txes):
await conn.execute(f"INSERT INTO t values ({i}, 'payload')")
expected_sum += i
i += 1
await conn.close()
pg.stop()
pg.adjust_for_safekeepers(safekeepers_guc(env, [True] * len(env.safekeepers)))
pg.start()
conn = await pg.connect_async()
log.info(f'Executed {i-1} queries')
res = await conn.fetchval('SELECT sum(key) FROM t')
assert res == expected_sum
# do inserts while restarting postgres and messing with safekeeper addresses
def test_wal_lagging(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
env.neon_cli.create_branch('test_wal_lagging')
pg = env.postgres.create_start('test_wal_lagging')
asyncio.run(run_wal_lagging(env, pg))

View File

@@ -1,21 +1,23 @@
import calendar
import dataclasses
import enum
import json
import os
import re
import timeit
import uuid
import warnings
from contextlib import contextmanager
from datetime import datetime
from pathlib import Path
# Type-related stuff
from typing import Iterator, Optional
import re
import subprocess
import timeit
import calendar
import enum
from datetime import datetime
import uuid
import pytest
from _pytest.config import Config
from _pytest.terminal import TerminalReporter
import warnings
from contextlib import contextmanager
# Type-related stuff
from typing import Iterator, Optional
"""
This file contains fixtures for micro-benchmarks.
@@ -75,7 +77,7 @@ class PgBenchRunResult:
# we know significant parts of these values from test input
# but to be precise take them from output
for line in stdout_lines:
for line in stdout.splitlines():
# scaling factor: 5
if line.startswith("scaling factor:"):
scale = int(line.split()[-1])
@@ -129,58 +131,6 @@ class PgBenchRunResult:
)
@dataclasses.dataclass
class PgBenchInitResult:
total: float
drop_tables: Optional[float]
create_tables: Optional[float]
client_side_generate: Optional[float]
vacuum: Optional[float]
primary_keys: Optional[float]
duration: float
start_timestamp: int
end_timestamp: int
@classmethod
def parse_from_stderr(
cls,
stderr: str,
duration: float,
start_timestamp: int,
end_timestamp: int,
):
# Parses pgbench initialize output for default initialization steps (dtgvp)
# Example: done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s).
last_line = stderr.splitlines()[-1]
regex = re.compile(r"done in (\d+\.\d+) s "
r"\("
r"(?:drop tables (\d+\.\d+) s)?(?:, )?"
r"(?:create tables (\d+\.\d+) s)?(?:, )?"
r"(?:client-side generate (\d+\.\d+) s)?(?:, )?"
r"(?:vacuum (\d+\.\d+) s)?(?:, )?"
r"(?:primary keys (\d+\.\d+) s)?(?:, )?"
r"\)\.")
if (m := regex.match(last_line)) is not None:
total, drop_tables, create_tables, client_side_generate, vacuum, primary_keys = [float(v) for v in m.groups() if v is not None]
else:
raise RuntimeError(f"can't parse pgbench initialize results from `{last_line}`")
return cls(
total=total,
drop_tables=drop_tables,
create_tables=create_tables,
client_side_generate=client_side_generate,
vacuum=vacuum,
primary_keys=primary_keys,
duration=duration,
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
)
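A minimal usage sketch of parse_from_stderr, fed the example line quoted in the comment above (duration and timestamps are placeholder values):

stderr = ("done in 5.66 s (drop tables 0.05 s, create tables 0.31 s, "
          "client-side generate 2.01 s, vacuum 0.53 s, primary keys 0.38 s).")
res = PgBenchInitResult.parse_from_stderr(stderr=stderr,
                                          duration=5.7,
                                          start_timestamp=1660000000,
                                          end_timestamp=1660000006)
assert res.total == 5.66 and res.vacuum == 0.53 and res.primary_keys == 0.38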
@enum.unique
class MetricReport(str, enum.Enum): # str is a hack to make it json serializable
# this means that this is a constant test parameter
@@ -282,32 +232,6 @@ class NeonBenchmarker:
'',
MetricReport.TEST_PARAM)
def record_pg_bench_init_result(self, prefix: str, result: PgBenchInitResult):
test_params = [
"start_timestamp",
"end_timestamp",
]
for test_param in test_params:
self.record(f"{prefix}.{test_param}",
getattr(result, test_param),
'',
MetricReport.TEST_PARAM)
metrics = [
"duration",
"drop_tables",
"create_tables",
"client_side_generate",
"vacuum",
"primary_keys",
]
for metric in metrics:
if (value := getattr(result, metric)) is not None:
self.record(f"{prefix}.{metric}",
value,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
def get_io_writes(self, pageserver) -> int:
"""
Fetch the "cumulative # of bytes written" metric from the pageserver

View File

@@ -1488,6 +1488,17 @@ class NeonPageserver(PgProtocol):
self.running = True
return self
def _wait_for_death(self):
"""Wait for pageserver to die. Assumes kill signal is sent."""
pid_path = pathlib.Path(self.env.repo_dir) / "pageserver.pid"
pid = read_pid(pid_path)
retries_left = 20
while check_pid(pid):
time.sleep(0.2)
retries_left -= 1
if retries_left == 0:
raise AssertionError("Pageserver failed to die")
def stop(self, immediate=False) -> 'NeonPageserver':
"""
Stop the page server.
@@ -1495,6 +1506,7 @@ class NeonPageserver(PgProtocol):
"""
if self.running:
self.env.neon_cli.pageserver_stop(immediate)
self._wait_for_death()
self.running = False
return self
@@ -2004,6 +2016,17 @@ def read_pid(path: Path) -> int:
return int(path.read_text())
def check_pid(pid):
"""Check whether pid is running."""
try:
# If sig is 0, then no signal is sent, but error checking is still performed.
os.kill(pid, 0)
except OSError:
return False
else:
return True
@dataclass
class SafekeeperPort:
pg: int
@@ -2440,7 +2463,7 @@ def wait_for_upload(pageserver_http_client: NeonPageserverHttpClient,
timeline: uuid.UUID,
lsn: int):
"""waits for local timeline upload up to specified lsn"""
for i in range(20):
for i in range(10):
current_lsn = remote_consistent_lsn(pageserver_http_client, tenant, timeline)
if current_lsn >= lsn:
return

View File

@@ -32,16 +32,10 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str:
stdout_filename = basepath + '.stdout'
stderr_filename = basepath + '.stderr'
try:
with open(stdout_filename, 'w') as stdout_f:
with open(stderr_filename, 'w') as stderr_f:
log.info(f'Capturing stdout to "{base}.stdout" and stderr to "{base}.stderr"')
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
finally:
# Remove empty files if there is no output
for filename in (stdout_filename, stderr_filename):
if os.stat(filename).st_size == 0:
os.remove(filename)
with open(stdout_filename, 'w') as stdout_f:
with open(stderr_filename, 'w') as stderr_f:
log.info('(capturing output to "{}.stdout")'.format(base))
subprocess.run(cmd, **kwargs, stdout=stdout_f, stderr=stderr_f)
return basepath
@@ -146,12 +140,3 @@ def parse_delta_layer(f_name: str) -> Tuple[int, int, int, int]:
key_parts = parts[0].split("-")
lsn_parts = parts[1].split("-")
return int(key_parts[0], 16), int(key_parts[1], 16), int(lsn_parts[0], 16), int(lsn_parts[1], 16)
def get_scale_for_db(size_mb: int) -> int:
"""Returns pgbench scale factor for given target db size in MB.
Ref https://www.cybertec-postgresql.com/en/a-formula-to-calculate-pgbench-scaling-factor-for-target-db-size/
"""
return round(0.06689 * size_mb - 0.5)
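A quick sanity check of the formula above (a rough linear fit from the linked article):

assert get_scale_for_db(1024) == 68        # ~1 GB target -> pgbench scale 68
assert get_scale_for_db(10 * 1024) == 684  # ~10 GB target -> pgbench scale 684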

View File

@@ -10,7 +10,7 @@ In the CI, the performance tests are run in the same environment as the other in
## Remote tests
There are a few tests that are marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment; instead, they require a libpq connection string to connect to, so they can be run against any Postgres-compatible database. Currently, the CI runs these tests on our staging and captest environments daily. Those are not isolated environments, so there can be noise in the results due to the activity of other clusters.
There are a few tests that are marked with `pytest.mark.remote_cluster`. These tests do not set up a local environment; instead, they require a libpq connection string to connect to, so they can be run against any Postgres-compatible database. Currently, the CI runs these tests on our staging environment daily. Staging is not an isolated environment, so there can be noise in the results due to the activity of other clusters.
## Noise

View File

@@ -1,23 +1,17 @@
import calendar
import enum
import os
import timeit
from datetime import datetime
from contextlib import closing
from fixtures.neon_fixtures import PgBin, VanillaPostgres, NeonEnv, profiling_supported
from fixtures.compare_fixtures import PgCompare, VanillaCompare, NeonCompare
from fixtures.benchmark_fixture import PgBenchRunResult, MetricReport, NeonBenchmarker
from fixtures.log_helper import log
from pathlib import Path
from typing import List
import pytest
from fixtures.benchmark_fixture import MetricReport, PgBenchInitResult, PgBenchRunResult
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.neon_fixtures import profiling_supported
from fixtures.utils import get_scale_for_db
@enum.unique
class PgBenchLoadType(enum.Enum):
INIT = "init"
SIMPLE_UPDATE = "simple_update"
SELECT_ONLY = "select-only"
from datetime import datetime
import calendar
import os
import timeit
def utc_now_timestamp() -> int:
@@ -28,24 +22,23 @@ def init_pgbench(env: PgCompare, cmdline):
# calculate timestamps and durations separately
# timestamp is intended to be used for linking to grafana and logs
# duration is actually a metric and uses float instead of int for timestamp
start_timestamp = utc_now_timestamp()
init_start_timestamp = utc_now_timestamp()
t0 = timeit.default_timer()
with env.record_pageserver_writes('init.pageserver_writes'):
out = env.pg_bin.run_capture(cmdline)
env.pg_bin.run_capture(cmdline)
env.flush()
init_duration = timeit.default_timer() - t0
init_end_timestamp = utc_now_timestamp()
duration = timeit.default_timer() - t0
end_timestamp = utc_now_timestamp()
stderr = Path(f"{out}.stderr").read_text()
res = PgBenchInitResult.parse_from_stderr(
stderr=stderr,
duration=duration,
start_timestamp=start_timestamp,
end_timestamp=end_timestamp,
)
env.zenbenchmark.record_pg_bench_init_result("init", res)
env.zenbenchmark.record("init.duration",
init_duration,
unit="s",
report=MetricReport.LOWER_IS_BETTER)
env.zenbenchmark.record("init.start_timestamp",
init_start_timestamp,
'',
MetricReport.TEST_PARAM)
env.zenbenchmark.record("init.end_timestamp", init_end_timestamp, '', MetricReport.TEST_PARAM)
def run_pgbench(env: PgCompare, prefix: str, cmdline):
@@ -77,84 +70,38 @@ def run_pgbench(env: PgCompare, prefix: str, cmdline):
# the test database.
#
# Currently, the # of connections is hardcoded at 4
def run_test_pgbench(env: PgCompare, scale: int, duration: int, workload_type: PgBenchLoadType):
def run_test_pgbench(env: PgCompare, scale: int, duration: int):
# Record the scale and initialize
env.zenbenchmark.record("scale", scale, '', MetricReport.TEST_PARAM)
init_pgbench(env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr()])
if workload_type == PgBenchLoadType.INIT:
# Run initialize
init_pgbench(
env, ['pgbench', f'-s{scale}', '-i', env.pg.connstr(options='-cstatement_timeout=1h')])
# Run simple-update workload
run_pgbench(env,
"simple-update", ['pgbench', '-N', '-c4', f'-T{duration}', '-P2', env.pg.connstr()])
if workload_type == PgBenchLoadType.SIMPLE_UPDATE:
# Run simple-update workload
run_pgbench(env,
"simple-update",
[
'pgbench',
'-N',
'-c4',
f'-T{duration}',
'-P2',
'--progress-timestamp',
env.pg.connstr(),
])
if workload_type == PgBenchLoadType.SELECT_ONLY:
# Run SELECT workload
run_pgbench(env,
"select-only",
[
'pgbench',
'-S',
'-c4',
f'-T{duration}',
'-P2',
'--progress-timestamp',
env.pg.connstr(),
])
# Run SELECT workload
run_pgbench(env,
"select-only", ['pgbench', '-S', '-c4', f'-T{duration}', '-P2', env.pg.connstr()])
env.report_size()
def get_durations_matrix(default: int = 45) -> List[int]:
def get_durations_matrix(default: int = 45):
durations = os.getenv("TEST_PG_BENCH_DURATIONS_MATRIX", default=str(default))
rv = []
for d in durations.split(","):
d = d.strip().lower()
if d.endswith('h'):
duration = int(d.removesuffix('h')) * 60 * 60
elif d.endswith('m'):
duration = int(d.removesuffix('m')) * 60
else:
duration = int(d.removesuffix('s'))
rv.append(duration)
return rv
return list(map(int, durations.split(",")))
def get_scales_matrix(default: int = 10) -> List[int]:
def get_scales_matrix(default: int = 10):
scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX", default=str(default))
rv = []
for s in scales.split(","):
s = s.strip().lower()
if s.endswith('mb'):
scale = get_scale_for_db(int(s.removesuffix('mb')))
elif s.endswith('gb'):
scale = get_scale_for_db(int(s.removesuffix('gb')) * 1024)
else:
scale = int(s)
rv.append(scale)
return rv
return list(map(int, scales.split(",")))
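A sketch of how the suffix-aware variants above interpret the environment (example values are hypothetical):

# TEST_PG_BENCH_DURATIONS_MATRIX="45,10m,1h"  -> [45, 600, 3600]  (seconds)
# TEST_PG_BENCH_SCALES_MATRIX="10,100mb,1gb"  -> [10, get_scale_for_db(100), get_scale_for_db(1024)]
import os
os.environ["TEST_PG_BENCH_DURATIONS_MATRIX"] = "45,10m,1h"
assert get_durations_matrix() == [45, 600, 3600]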
# Run the pgbench tests against vanilla Postgres and neon
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
def test_pgbench(neon_with_baseline: PgCompare, scale: int, duration: int):
run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.INIT)
run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
run_test_pgbench(neon_with_baseline, scale, duration, PgBenchLoadType.SELECT_ONLY)
run_test_pgbench(neon_with_baseline, scale, duration)
# Run the pgbench tests, and generate a flamegraph from it
@@ -176,34 +123,12 @@ profiling="page_requests"
env = neon_env_builder.init_start()
env.neon_cli.create_branch("empty", "main")
neon_compare = NeonCompare(zenbenchmark, env, pg_bin, "pgbench")
run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.INIT)
run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
run_test_pgbench(neon_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
run_test_pgbench(NeonCompare(zenbenchmark, env, pg_bin, "pgbench"), scale, duration)
# The following 3 tests run on an existing database as it was set up by the previous tests,
# and leave the database in a state that the next tests depend on.
# Modifying the definition order of these functions or adding other remote tests in between will alter results.
# See usage of --sparse-ordering flag in the pytest invocation in the CI workflow
#
# Run the pgbench tests against an existing Postgres cluster
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote_init(remote_compare: PgCompare, scale: int, duration: int):
run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.INIT)
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote_simple_update(remote_compare: PgCompare, scale: int, duration: int):
run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SIMPLE_UPDATE)
@pytest.mark.parametrize("scale", get_scales_matrix())
@pytest.mark.parametrize("duration", get_durations_matrix())
@pytest.mark.remote_cluster
def test_pgbench_remote_select_only(remote_compare: PgCompare, scale: int, duration: int):
run_test_pgbench(remote_compare, scale, duration, PgBenchLoadType.SELECT_ONLY)
def test_pgbench_remote(remote_compare: PgCompare, scale: int, duration: int):
run_test_pgbench(remote_compare, scale, duration)

View File

@@ -3,10 +3,10 @@ import shutil
import subprocess
from pathlib import Path
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
import pytest
from fixtures.neon_fixtures import RemotePostgres
from fixtures.utils import subprocess_capture
@pytest.mark.remote_cluster
@@ -25,7 +25,7 @@ from fixtures.utils import subprocess_capture
"typescript/postgresql-client",
],
)
def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: str):
def test_pg_clients(remote_pg: RemotePostgres, client: str):
conn_options = remote_pg.conn_options()
env_file = None
@@ -43,10 +43,12 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st
if docker_bin is None:
raise RuntimeError("docker is required for running this test")
build_cmd = [docker_bin, "build", "--tag", image_tag, f"{Path(__file__).parent / client}"]
subprocess_capture(str(test_output_dir), build_cmd, check=True)
build_cmd = [
docker_bin, "build", "--quiet", "--tag", image_tag, f"{Path(__file__).parent / client}"
]
run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
basepath = subprocess_capture(str(test_output_dir), run_cmd, check=True)
assert Path(f"{basepath}.stdout").read_text().strip() == "1"
subprocess.run(build_cmd, check=True)
result = subprocess.run(run_cmd, check=True, capture_output=True, text=True)
assert result.stdout.strip() == "1"