Merge pull request #2377 from neondatabase/main

Release 2022-09-01
This commit is contained in:
Anastasia Lubennikova
2022-09-01 20:20:44 +03:00
committed by GitHub
215 changed files with 15242 additions and 5768 deletions

View File

@@ -1,18 +1,18 @@
**/.git/
**/__pycache__
**/.pytest_cache
*
.git
target
tmp_check
tmp_install
tmp_check_cli
test_output
.vscode
.neon
integration_tests/.neon
.mypy_cache
Dockerfile
.dockerignore
!Cargo.toml
!Cargo.lock
!Makefile
!.cargo/
!.config/
!control_plane/
!compute_tools/
!libs/
!pageserver/
!pgxn/
!proxy/
!safekeeper/
!vendor/postgres/
!workspace_hack/
!neon_local/

1
.git-blame-ignore-revs Normal file
View File

@@ -0,0 +1 @@
4c2bb43775947775401cbb9d774823c5723a91f8

23
.github/ISSUE_TEMPLATE/bug-template.md vendored Normal file
View File

@@ -0,0 +1,23 @@
---
name: Bug Template
about: Used for describing bugs
title: ''
labels: t/bug
assignees: ''
---
## Steps to reproduce
## Expected result
## Actual result
## Environment
## Logs, links
-

25
.github/ISSUE_TEMPLATE/epic-template.md vendored Normal file
View File

@@ -0,0 +1,25 @@
---
name: Epic Template
about: A set of related tasks contributing towards specific outcome, comprizing of
more than 1 week of work.
title: 'Epic: '
labels: t/Epic
assignees: ''
---
## Motivation
## DoD
## Implementation ideas
## Tasks
- [ ]
## Other related tasks and Epics
-

View File

@@ -0,0 +1,20 @@
## Release 202Y-MM-DD
**NB: this PR must be merged only by 'Create a merge commit'!**
### Checklist when preparing for release
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
- [ ] Ask in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to rollout the release. Any blockers?
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
### Checklist after release
- [ ] Based on the merged commits write release notes and open a PR into `website` repo ([example](https://github.com/neondatabase/website/pull/120/files))
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->

217
.github/actions/allure-report/action.yml vendored Normal file
View File

@@ -0,0 +1,217 @@
name: 'Create Allure report'
description: 'Create and publish Allure report'
inputs:
action:
desctiption: 'generate or store'
required: true
build_type:
description: '`build_type` from run-python-test-set action'
required: true
test_selection:
description: '`test_selector` from run-python-test-set action'
required: false
runs:
using: "composite"
steps:
- name: Validate input parameters
shell: bash -euxo pipefail {0}
run: |
if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
exit 1
fi
if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
echo 2>&1 "inputs.test_selection must be set for 'store' action"
exit 2
fi
- name: Calculate key
id: calculate-key
shell: bash -euxo pipefail {0}
run: |
# TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
if [ "${pr_number}" != "null" ]; then
key=pr-${pr_number}
elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then
# Shortcut for a special branch
key=main
else
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
fi
echo "::set-output name=KEY::${key}"
- uses: actions/setup-java@v3
if: ${{ inputs.action == 'generate' }}
with:
distribution: 'temurin'
java-version: '17'
- name: Install Allure
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
run: |
if ! which allure; then
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
unzip -q ${ALLURE_ZIP}
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
rm -f ${ALLURE_ZIP}
fi
env:
ALLURE_VERSION: 2.19.0
ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
- name: Upload Allure results
if: ${{ inputs.action == 'store' }}
env:
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
run: |
# Add metadata
cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
{
"name": "GitHub Actions",
"type": "github",
"url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
"buildOrder": ${GITHUB_RUN_ID},
"buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
"buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
"reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
"reportName": "Allure Report"
}
EOF
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
TEST_SELECTION=${{ inputs.test_selection }}
BUILD_TYPE=${{ inputs.build_type }}
EOF
ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
ZSTD_NBTHREADS=0
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
# Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
- name: Acquire Allure lock
if: ${{ inputs.action == 'generate' }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
run: |
LOCK_TIMEOUT=300 # seconds
for _ in $(seq 1 5); do
for i in $(seq 1 ${LOCK_TIMEOUT}); do
LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
# `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
break
fi
sleep 1
done
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt
aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
# A double-check that exactly WE have acquired the lock
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
break
fi
done
- name: Generate and publish final Allure report
if: ${{ inputs.action == 'generate' }}
id: generate-report
env:
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
TEST_OUTPUT: /tmp/test_output
BUCKET: neon-github-public-dev
shell: bash -euxo pipefail {0}
run: |
# Get previously uploaded data for this run
ZSTD_NBTHREADS=0
s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key')
if [ -z "$s3_filepaths" ]; then
# There's no previously uploaded data for this run
exit 0
fi
for s3_filepath in ${s3_filepaths}; do
aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
mkdir -p ${archive%.tar.zst}
tar -xf ${archive} -C ${archive%.tar.zst}
rm -f ${archive}
done
# Get history trend
aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
# Generate report
allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
# Replace a logo link with a redirect to the latest version of the report
sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
# Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
# Generate redirect
cat <<EOF > ./index.html
<!DOCTYPE html>
<meta charset="utf-8">
<title>Redirecting to ${REPORT_URL}</title>
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
EOF
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
echo "::set-output name=REPORT_URL::${REPORT_URL}"
- name: Release Allure lock
if: ${{ inputs.action == 'generate' && always() }}
shell: bash -euxo pipefail {0}
env:
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
BUCKET: neon-github-public-dev
run: |
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
fi
- uses: actions/github-script@v6
if: ${{ inputs.action == 'generate' && always() }}
env:
REPORT_URL: ${{ steps.generate-report.outputs.REPORT_URL }}
BUILD_TYPE: ${{ inputs.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
script: |
const { REPORT_URL, BUILD_TYPE, SHA } = process.env
await github.rest.repos.createCommitStatus({
owner: context.repo.owner,
repo: context.repo.repo,
sha: `${SHA}`,
state: 'success',
target_url: `${REPORT_URL}`,
context: `Allure report / ${BUILD_TYPE}`,
})

View File

@@ -3,11 +3,11 @@ description: 'Runs a Neon python test set, performing all the required preparati
inputs:
build_type:
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster'
required: true
rust_toolchain:
description: 'Rust toolchain version to fetch the caches'
required: true
required: false
test_selection:
description: 'A python test suite to run'
required: true
@@ -24,7 +24,7 @@ inputs:
required: false
default: 'true'
save_perf_report:
description: 'Whether to upload the performance report'
description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set'
required: false
default: 'false'
run_with_real_s3:
@@ -52,6 +52,7 @@ runs:
using: "composite"
steps:
- name: Get Neon artifact
if: inputs.build_type != 'remote'
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
@@ -78,7 +79,6 @@ runs:
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
@@ -88,6 +88,12 @@ runs:
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
shell: bash -euxo pipefail {0}
run: |
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1
fi
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
rm -rf $PERF_REPORT_DIR
@@ -119,6 +125,13 @@ runs:
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
cov_prefix=()
else
cov_prefix=()
fi
# Wake up the cluster if we use remote neon instance
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
fi
# Run the tests.
@@ -131,11 +144,12 @@ runs:
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
mkdir -p $TEST_OUTPUT/allure/results
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--alluredir=$TEST_OUTPUT/allure/results \
--tb=short \
--verbose \
-m "not remote_cluster" \
-rA $TEST_SELECTION $EXTRA_PARAMS
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
@@ -146,17 +160,10 @@ runs:
fi
fi
- name: Delete all data but logs
shell: bash -euxo pipefail {0}
- name: Create Allure report
if: always()
run: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
du -sh /tmp/test_output/*
- name: Upload python test logs
if: always()
uses: ./.github/actions/upload
uses: ./.github/actions/allure-report
with:
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
path: /tmp/test_output/
action: store
build_type: ${{ inputs.build_type }}
test_selection: ${{ inputs.test_selection }}

View File

@@ -106,7 +106,7 @@ jobs:
mkdir -p perf-report-staging
# Set --sparse-ordering option of pytest-order plugin to ensure tests are running in order of appears in the file,
# it's important for test_perf_pgbench.py::test_pgbench_remote_* tests
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 5400
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400
- name: Submit result
env:
@@ -128,9 +128,9 @@ jobs:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: "10gb"
REMOTE_ENV: "1"
POSTGRES_DISTRIB_DIR: /usr
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
strategy:
fail-fast: false
@@ -138,23 +138,15 @@ jobs:
connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ]
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
options: --init
timeout-minutes: 360 # 6h
steps:
- uses: actions/checkout@v3
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Calculate platform
id: calculate-platform
env:
@@ -173,50 +165,56 @@ jobs:
- name: Install Deps
run: |
echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
sudo apt -y update
sudo apt install -y postgresql-14 postgresql-client-14
sudo apt install -y postgresql-14
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
run: |
mkdir -p perf-report-captest
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
- name: Benchmark simple-update
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
run: |
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
- name: Benchmark select-only
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
run: |
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
- name: Submit result
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
run: |
REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
- name: Upload logs
if: always()
uses: ./.github/actions/upload
- name: Benchmark simple-update
uses: ./.github/actions/run-python-test-set
with:
name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }}
path: /tmp/test_output/
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Benchmark simple-update
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ env.BUILD_TYPE }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
env:
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
- name: Create Allure report
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ env.BUILD_TYPE }}
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}

View File

@@ -95,11 +95,11 @@ jobs:
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES=""
CARGO_FLAGS=""
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features profiling"
CARGO_FLAGS="--release $CARGO_FEATURES"
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
@@ -121,8 +121,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Cache postgres build
id: cache_pg
@@ -136,6 +136,10 @@ jobs:
run: mold -run make postgres -j$(nproc)
shell: bash -euxo pipefail {0}
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
shell: bash -euxo pipefail {0}
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
@@ -202,7 +206,7 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
pg_regress-tests:
regress-tests:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -220,42 +224,13 @@ jobs:
submodules: true
fetch-depth: 2
- name: Pytest regress tests
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_pg_regress
test_selection: regress
needs_postgres_source: true
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
other-tests:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
- name: Pytest other tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_others
run_with_real_s3: true
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
@@ -298,12 +273,35 @@ jobs:
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
merge-allure-report:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests, benchmarks ]
if: always()
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
- name: Create Allure report
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ matrix.build_type }}
coverage-report:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ other-tests, pg_regress-tests ]
needs: [ regress-tests ]
strategy:
fail-fast: false
matrix:
@@ -325,7 +323,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
@@ -460,19 +458,18 @@ jobs:
- name: Configure ECR login
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
- name: Kaniko build compute node
working-directory: ./vendor/postgres/
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
- name: Kaniko build compute node with extensions
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
promote-images:
runs-on: dev
needs: [ neon-image, compute-tools-image, compute-node-image ]
needs: [ neon-image, compute-node-image, compute-tools-image ]
if: github.event_name != 'workflow_dispatch'
container: amazon/aws-cli
strategy:
fail-fast: false
matrix:
name: [ neon, compute-tools, compute-node ]
name: [ neon, compute-node, compute-tools ]
steps:
- name: Promote image to latest
@@ -489,18 +486,6 @@ jobs:
run: |
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
# - name: Get build tag
# run: |
# if [[ "$GITHUB_REF_NAME" == "main" ]]; then
# echo "::set-output name=tag::$(git rev-list --count HEAD)"
# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
# echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
# else
# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' "
# echo "::set-output name=tag::$GITHUB_RUN_ID"
# fi
# id: build-tag
- name: Configure ECR login
run: |
@@ -516,6 +501,9 @@ jobs:
- name: Pull compute node image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
- name: Configure docker login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
@@ -531,6 +519,9 @@ jobs:
- name: Push compute node image to Docker Hub
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
- name: Push rust image to Docker Hub
run: crane push rust neondatabase/rust:pinned
- name: Add latest tag to images
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
@@ -567,7 +558,7 @@ jobs:
#container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
needs: [ push-docker-hub, calculate-deploy-targets, tag ]
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
@@ -622,7 +613,7 @@ jobs:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
needs: [ push-docker-hub, calculate-deploy-targets, tag ]
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'

View File

@@ -65,7 +65,7 @@ jobs:
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v2
uses: actions/cache@v3
with:
path: |
tmp_install/
@@ -81,6 +81,9 @@ jobs:
if: steps.cache_pg.outputs.cache-hit != 'true'
run: make postgres
- name: Build neon extensions
run: make neon-pg-ext
# Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
# and the real cause will be inside config.log
- name: Print configure logs in case of failure
@@ -94,20 +97,20 @@ jobs:
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v2
uses: actions/cache@v3
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
- name: Run cargo clippy
run: ./run_clippy.sh
- name: Ensure all project builds
run: cargo build --all --all-targets
run: cargo build --locked --all --all-targets
check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
@@ -128,8 +131,14 @@ jobs:
- name: Install Python deps
run: ./scripts/pysync
- name: Run yapf to ensure code format
run: poetry run yapf --recursive --diff .
- name: Run isort to ensure code format
run: poetry run isort --diff --check .
- name: Run black to ensure code format
run: poetry run black --diff --check .
- name: Run flake8 to ensure code format
run: poetry run flake8 .
- name: Run mypy to check types
run: poetry run mypy .

View File

@@ -1,45 +0,0 @@
name: Send Notifications
on:
push:
branches: [ main ]
jobs:
send-notifications:
timeout-minutes: 30
name: send commit notifications
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
- name: Form variables for notification message
id: git_info_grab
run: |
git_stat=$(git show --stat=50)
git_stat="${git_stat//'%'/'%25'}"
git_stat="${git_stat//$'\n'/'%0A'}"
git_stat="${git_stat//$'\r'/'%0D'}"
git_stat="${git_stat// /}" # space -> 'Space En', as github tends to eat ordinary spaces
echo "::set-output name=git_stat::$git_stat"
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
- name: Send notification
uses: appleboy/telegram-action@master
with:
to: ${{ secrets.TELEGRAM_TO }}
token: ${{ secrets.TELEGRAM_TOKEN }}
format: markdown
args: |
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
```
${{ steps.git_info_grab.outputs.git_stat }}
```

View File

@@ -1,10 +0,0 @@
# This file is only read when `yapf` is run from this directory.
# Hence we only top-level directories here to avoid confusion.
# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
vendor/
target/
tmp_install/
__pycache__/
test_output/
.neon/
.git/

247
Cargo.lock generated
View File

@@ -48,9 +48,9 @@ dependencies = [
[[package]]
name = "anyhow"
version = "1.0.58"
version = "1.0.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704"
checksum = "c91f1f46651137be86f3a2b9a8359f9ab421d04d941c62b5982e1ca21113adf9"
dependencies = [
"backtrace",
]
@@ -77,7 +77,7 @@ dependencies = [
"num-traits",
"rusticata-macros",
"thiserror",
"time 0.3.11",
"time 0.3.12",
]
[[package]]
@@ -126,9 +126,9 @@ dependencies = [
[[package]]
name = "async-trait"
version = "0.1.56"
version = "0.1.57"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716"
checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f"
dependencies = [
"proc-macro2",
"quote",
@@ -166,7 +166,7 @@ dependencies = [
"http",
"http-body",
"hyper",
"itoa 1.0.2",
"itoa 1.0.3",
"matchit",
"memchr",
"mime",
@@ -298,9 +298,9 @@ checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3"
[[package]]
name = "bytemuck"
version = "1.10.0"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a"
checksum = "a5377c8865e74a160d21f29c2d40669f53286db6eab59b88540cbb12ffc8b835"
[[package]]
name = "byteorder"
@@ -310,9 +310,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
[[package]]
name = "bytes"
version = "1.1.0"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8"
checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db"
dependencies = [
"serde",
]
@@ -386,9 +386,9 @@ dependencies = [
[[package]]
name = "clap"
version = "3.2.12"
version = "3.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d"
checksum = "a3dbbb6653e7c55cc8595ad3e1f7be8f32aba4eb7ff7f0fd1163d4f3d137c0a9"
dependencies = [
"atty",
"bitflags",
@@ -455,7 +455,7 @@ version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"clap 3.2.12",
"clap 3.2.16",
"env_logger",
"hyper",
"log",
@@ -601,9 +601,9 @@ dependencies = [
[[package]]
name = "crossbeam-channel"
version = "0.5.5"
version = "0.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c"
checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
dependencies = [
"cfg-if",
"crossbeam-utils",
@@ -611,9 +611,9 @@ dependencies = [
[[package]]
name = "crossbeam-deque"
version = "0.8.1"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
dependencies = [
"cfg-if",
"crossbeam-epoch",
@@ -622,9 +622,9 @@ dependencies = [
[[package]]
name = "crossbeam-epoch"
version = "0.9.9"
version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d"
checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1"
dependencies = [
"autocfg",
"cfg-if",
@@ -636,9 +636,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.10"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83"
checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
dependencies = [
"cfg-if",
"once_cell",
@@ -917,9 +917,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
[[package]]
name = "fastrand"
version = "1.7.0"
version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf"
checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
dependencies = [
"instant",
]
@@ -1086,9 +1086,9 @@ dependencies = [
[[package]]
name = "generic-array"
version = "0.14.5"
version = "0.14.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803"
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
dependencies = [
"typenum",
"version_check",
@@ -1164,20 +1164,14 @@ version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
[[package]]
name = "hashbrown"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
dependencies = [
"ahash",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
dependencies = [
"ahash",
]
[[package]]
name = "heck"
@@ -1245,7 +1239,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
dependencies = [
"bytes",
"fnv",
"itoa 1.0.2",
"itoa 1.0.3",
]
[[package]]
@@ -1308,7 +1302,7 @@ dependencies = [
"http-body",
"httparse",
"httpdate",
"itoa 1.0.2",
"itoa 1.0.3",
"pin-project-lite",
"socket2",
"tokio",
@@ -1379,7 +1373,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
dependencies = [
"autocfg",
"hashbrown 0.12.3",
"hashbrown",
]
[[package]]
@@ -1391,7 +1385,7 @@ dependencies = [
"ahash",
"atty",
"indexmap",
"itoa 1.0.2",
"itoa 1.0.3",
"lazy_static",
"log",
"num-format",
@@ -1432,15 +1426,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "itoa"
version = "1.0.2"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d"
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
[[package]]
name = "js-sys"
version = "0.3.58"
version = "0.3.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27"
checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2"
dependencies = [
"wasm-bindgen",
]
@@ -1482,9 +1476,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
[[package]]
name = "libc"
version = "0.2.126"
version = "0.2.127"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b"
[[package]]
name = "libloading"
@@ -1659,7 +1653,7 @@ name = "neon_local"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 3.2.12",
"clap 3.2.16",
"comfy-table",
"control_plane",
"git-version",
@@ -1854,7 +1848,7 @@ dependencies = [
"byteorder",
"bytes",
"chrono",
"clap 3.2.12",
"clap 3.2.16",
"close_fds",
"const_format",
"crc32c",
@@ -2111,7 +2105,6 @@ dependencies = [
"bindgen",
"byteorder",
"bytes",
"chrono",
"crc32c",
"env_logger",
"hex",
@@ -2155,9 +2148,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
[[package]]
name = "prettyplease"
version = "0.1.16"
version = "0.1.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2"
checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9"
dependencies = [
"proc-macro2",
"syn",
@@ -2171,9 +2164,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
[[package]]
name = "proc-macro2"
version = "1.0.40"
version = "1.0.43"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7"
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
dependencies = [
"unicode-ident",
]
@@ -2271,13 +2264,14 @@ dependencies = [
"base64",
"bstr",
"bytes",
"clap 3.2.12",
"clap 3.2.16",
"futures",
"git-version",
"hashbrown 0.11.2",
"hashbrown",
"hex",
"hmac 0.12.1",
"hyper",
"itertools",
"md5",
"metrics",
"once_cell",
@@ -2289,7 +2283,7 @@ dependencies = [
"routerify",
"rstest",
"rustls",
"rustls-pemfile 0.2.1",
"rustls-pemfile",
"scopeguard",
"serde",
"serde_json",
@@ -2315,20 +2309,11 @@ dependencies = [
"memchr",
]
[[package]]
name = "quickcheck"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
dependencies = [
"rand",
]
[[package]]
name = "quote"
version = "1.0.20"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804"
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
dependencies = [
"proc-macro2",
]
@@ -2411,9 +2396,9 @@ dependencies = [
[[package]]
name = "redox_syscall"
version = "0.2.13"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42"
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
dependencies = [
"bitflags",
]
@@ -2508,7 +2493,7 @@ dependencies = [
"percent-encoding",
"pin-project-lite",
"rustls",
"rustls-pemfile 1.0.0",
"rustls-pemfile",
"serde",
"serde_json",
"serde_urlencoded",
@@ -2699,18 +2684,9 @@ dependencies = [
[[package]]
name = "rustls-pemfile"
version = "0.2.1"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9"
dependencies = [
"base64",
]
[[package]]
name = "rustls-pemfile"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9"
checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
dependencies = [
"base64",
]
@@ -2726,15 +2702,15 @@ dependencies = [
[[package]]
name = "rustversion"
version = "1.0.8"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8"
[[package]]
name = "ryu"
version = "1.0.10"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
[[package]]
name = "safekeeper"
@@ -2744,7 +2720,7 @@ dependencies = [
"async-trait",
"byteorder",
"bytes",
"clap 3.2.12",
"clap 3.2.16",
"const_format",
"crc32c",
"daemonize",
@@ -2835,15 +2811,15 @@ dependencies = [
[[package]]
name = "semver"
version = "1.0.12"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1"
checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711"
[[package]]
name = "serde"
version = "1.0.139"
version = "1.0.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6"
checksum = "e590c437916fb6b221e1d00df6e3294f3fccd70ca7e92541c475d6ed6ef5fee2"
dependencies = [
"serde_derive",
]
@@ -2860,9 +2836,9 @@ dependencies = [
[[package]]
name = "serde_derive"
version = "1.0.139"
version = "1.0.142"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb"
checksum = "34b5b8d809babe02f538c2cfec6f2c1ed10804c0e5a6a041a049a4f5588ccc2e"
dependencies = [
"proc-macro2",
"quote",
@@ -2871,11 +2847,11 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.82"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7"
checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7"
dependencies = [
"itoa 1.0.2",
"itoa 1.0.3",
"ryu",
"serde",
]
@@ -2887,7 +2863,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
dependencies = [
"form_urlencoded",
"itoa 1.0.2",
"itoa 1.0.3",
"ryu",
"serde",
]
@@ -2992,7 +2968,7 @@ dependencies = [
"num-bigint",
"num-traits",
"thiserror",
"time 0.3.11",
"time 0.3.12",
]
[[package]]
@@ -3003,9 +2979,12 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
[[package]]
name = "slab"
version = "0.4.6"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32"
checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
dependencies = [
"autocfg",
]
[[package]]
name = "smallvec"
@@ -3113,9 +3092,9 @@ dependencies = [
[[package]]
name = "syn"
version = "1.0.98"
version = "1.0.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd"
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
dependencies = [
"proc-macro2",
"quote",
@@ -3191,18 +3170,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
[[package]]
name = "thiserror"
version = "1.0.31"
version = "1.0.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a"
checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.31"
version = "1.0.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a"
checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21"
dependencies = [
"proc-macro2",
"quote",
@@ -3231,14 +3210,14 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.11"
version = "0.3.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217"
checksum = "74b7cc93fc23ba97fde84f7eea56c55d1ba183f495c6715defdfc7b9cb8c870f"
dependencies = [
"itoa 1.0.2",
"itoa 1.0.3",
"js-sys",
"libc",
"num_threads",
"quickcheck",
"time-macros",
]
@@ -3275,9 +3254,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
[[package]]
name = "tokio"
version = "1.20.0"
version = "1.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e"
checksum = "7a8325f63a7d4774dd041e363b2409ed1c5cbbd0f867795e661df066b2b0a581"
dependencies = [
"autocfg",
"bytes",
@@ -3607,9 +3586,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
[[package]]
name = "unicode-ident"
version = "1.0.2"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"
[[package]]
name = "unicode-normalization"
@@ -3679,7 +3658,7 @@ dependencies = [
"rand",
"routerify",
"rustls",
"rustls-pemfile 0.2.1",
"rustls-pemfile",
"rustls-split",
"serde",
"serde_json",
@@ -3728,7 +3707,7 @@ name = "wal_craft"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 3.2.12",
"clap 3.2.16",
"env_logger",
"log",
"once_cell",
@@ -3772,9 +3751,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.81"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994"
checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
@@ -3782,13 +3761,13 @@ dependencies = [
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.81"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a"
checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f"
dependencies = [
"bumpalo",
"lazy_static",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn",
@@ -3797,9 +3776,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.31"
version = "0.4.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f"
checksum = "fa76fb221a1f8acddf5b54ace85912606980ad661ac7a503b4570ffd3a624dad"
dependencies = [
"cfg-if",
"js-sys",
@@ -3809,9 +3788,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.81"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa"
checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
@@ -3819,9 +3798,9 @@ dependencies = [
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.81"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048"
checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da"
dependencies = [
"proc-macro2",
"quote",
@@ -3832,15 +3811,15 @@ dependencies = [
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.81"
version = "0.2.82"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be"
checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a"
[[package]]
name = "web-sys"
version = "0.3.58"
version = "0.3.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90"
checksum = "ed055ab27f941423197eb86b2035720b1a3ce40504df082cac2ecc6ed73335a1"
dependencies = [
"js-sys",
"wasm-bindgen",
@@ -3965,6 +3944,7 @@ version = "0.1.0"
dependencies = [
"ahash",
"anyhow",
"bstr",
"bytes",
"chrono",
"clap 2.34.0",
@@ -3974,7 +3954,7 @@ dependencies = [
"futures-task",
"futures-util",
"generic-array",
"hashbrown 0.11.2",
"hashbrown",
"hex",
"hyper",
"indexmap",
@@ -3989,11 +3969,12 @@ dependencies = [
"prost",
"rand",
"regex",
"regex-automata",
"regex-syntax",
"scopeguard",
"serde",
"syn",
"time 0.3.11",
"time 0.3.12",
"tokio",
"tokio-util",
"tracing",
@@ -4015,7 +3996,7 @@ dependencies = [
"oid-registry",
"rusticata-macros",
"thiserror",
"time 0.3.11",
"time 0.3.12",
]
[[package]]
@@ -4044,6 +4025,6 @@ dependencies = [
[[package]]
name = "zeroize"
version = "1.5.6"
version = "1.5.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442"
checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f"

View File

@@ -1,18 +1,27 @@
### Creates a storage Docker image with postgres, pageserver, safekeeper and proxy binaries.
### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters.
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
### inside this image in the real deployments.
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
ARG TAG=pinned
# Build Postgres
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
WORKDIR /home/nonroot
COPY vendor/postgres vendor/postgres
COPY Makefile Makefile
COPY --chown=nonroot vendor/postgres vendor/postgres
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
ENV BUILD_TYPE release
RUN set -e \
&& mold -run make -j $(nproc) -s postgres \
&& mold -run make -j $(nproc) -s neon-pg-ext \
&& rm -rf tmp_install/build \
&& tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz .
# Build zenith binaries
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build
FROM $REPOSITORY/$IMAGE:$TAG AS build
WORKDIR /home/nonroot
ARG GIT_VERSION=local
@@ -32,7 +41,7 @@ COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& mold -run cargo build --release \
&& mold -run cargo build --locked --release \
&& cachepot -s
# Build final image
@@ -58,7 +67,18 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy
COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \
-c "broker_endpoints=['http://etcd:2379']" \
-c "pg_distrib_dir='/usr/local'" \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"
VOLUME ["/data"]
USER zenith
EXPOSE 6400
CMD ["pageserver"]
EXPOSE 9898
CMD ["/bin/bash"]

114
Dockerfile.compute-node Normal file
View File

@@ -0,0 +1,114 @@
ARG TAG=pinned
# apparently, ARGs don't get replaced in RUN commands in kaniko
# ARG POSTGIS_VERSION=3.3.0
# ARG PLV8_VERSION=3.1.4
FROM debian:bullseye-slim AS build-deps
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
# Build Postgres from the neon postgres repository.
FROM build-deps AS pg-build
COPY vendor/postgres postgres
RUN cd postgres && \
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
# Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes.
# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some
# investigation to check that it works, and also keeps working in the future. So for now, we compile our own binaries.
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
cd postgis-3.3.0 && \
./autogen.sh && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
./configure && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
cd extensions/postgis && \
make clean && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
# Build plv8
FROM build-deps AS plv8-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing binutils
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
tar xvzf v3.1.4.tar.gz && \
cd plv8-3.1.4 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
# compile neon extensions
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
-C pgxn/neon \
-s install
# Compile and run the Neon-specific `compute_ctl` binary
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
USER nonroot
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
COPY --chown=nonroot . .
RUN cd compute_tools && cargo build --locked --release
# Put it all together into the final image
FROM debian:bullseye-slim
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
# TODO: Check if we can make the extension setup more modular versus a linear build
# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
RUN apt update && \
apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Debian bullseye provides GLIBC 2.31 when 2.34 is necessary as we compiled plv8 with that version
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
apt install -y --no-install-recommends -t testing binutils && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# "temporary" symlink for old control-plane
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -0,0 +1,87 @@
#
# Legacy version of the Dockerfile for the compute node.
# Used by e2e CI. Building Dockerfile.compute-node will take
# unreasonable ammount of time without v2 runners.
#
# TODO: remove once cloud repo CI is moved to v2 runners.
#
# Allow specifiyng different compute-tools tag and image repo, so we are
# able to use different images
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=compute-tools
ARG TAG=latest
#
# Image with pre-built tools
#
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
# Only to get ready compute_ctl binary as deppendency
#
# Image with Postgres build deps
#
FROM debian:buster-slim AS build-deps
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
#
# Image with built Postgres
#
FROM build-deps AS pg-build
# Add user postgres
RUN adduser postgres
RUN mkdir /pg && chown postgres:postgres /pg
# Copy source files
COPY ./vendor/postgres /pg/
COPY ./pgxn /pg/
# Build and install Postgres locally
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
# Install main binaries and contribs
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
# Install headers
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
# Install neon contrib
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
USER postgres
WORKDIR /pg
#
# Final compute node image to be exported
#
FROM debian:buster-slim
# libreadline-dev is required to run psql
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
# Add user postgres
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
echo "postgres:test_console_pass" | chpasswd && \
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
chown -R postgres:postgres /var/db/postgres && \
chmod 0750 /var/db/postgres/compute
# Copy ready Postgres binaries
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
# Copy binaries from compute-tools
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
# XXX: temporary symlink for compatibility with old control-plane
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
# Add postgres shared objects to the search path
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
USER postgres
ENTRYPOINT ["/usr/local/bin/compute_ctl"]

View File

@@ -1,6 +1,10 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
ARG IMAGE=rust
ARG TAG=pinned
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
WORKDIR /home/nonroot
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
@@ -16,7 +20,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev
COPY . .
RUN set -e \
&& mold -run cargo build -p compute_tools --release \
&& mold -run cargo build -p compute_tools --locked --release \
&& cachepot -s
# Final image that only has one binary

View File

@@ -51,7 +51,7 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Top level Makefile to build Zenith and PostgreSQL
#
.PHONY: all
all: zenith postgres
all: zenith postgres neon-pg-ext
### Zenith Rust bits
#
@@ -87,25 +87,39 @@ postgres: postgres-configure \
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
+@echo "Compiling PostgreSQL"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
+@echo "Compiling contrib/neon"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
+@echo "Compiling contrib/neon_test_utils"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
+@echo "Compiling libpq"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install
+@echo "Compiling pg_buffercache"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
+@echo "Compiling pageinspect"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
.PHONY: postgres-clean
postgres-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean
neon-pg-ext: postgres
+@echo "Compiling neon"
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
-C $(ROOT_PROJECT_DIR)/pgxn/neon install
+@echo "Compiling neon_test_utils"
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
-C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install
.PHONY: neon-pg-ext-clean
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean
cd pgxn/neon && $(MAKE) clean
cd pgxn/neon_test_utils && $(MAKE) clean
# This removes everything
.PHONY: distclean

View File

@@ -178,6 +178,7 @@ impl ComputeNode {
.args(&["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.expect("postgres --sync-safekeepers failed to start");
@@ -187,10 +188,13 @@ impl ComputeNode {
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() {
anyhow::bail!(
"postgres --sync-safekeepers exited with non-zero status: {}",
"postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}",
sync_output.status,
String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"),
);
}

View File

@@ -62,9 +62,16 @@ impl GenericOption {
/// Represent `GenericOption` as configuration option.
pub fn to_pg_setting(&self) -> String {
if let Some(val) = &self.value {
let name = match self.name.as_str() {
"safekeepers" => "neon.safekeepers",
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
it => it,
};
match self.vartype.as_ref() {
"string" => format!("{} = '{}'", self.name, val),
_ => format!("{} = {}", self.name, val),
"string" => format!("{} = '{}'", name, val),
_ => format!("{} = {}", name, val),
}
} else {
self.name.to_owned()

View File

@@ -85,7 +85,7 @@
"vartype": "bool"
},
{
"name": "safekeepers",
"name": "neon.safekeepers",
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
"vartype": "string"
},
@@ -181,7 +181,6 @@
}
]
},
"delta_operations": [
{
"action": "delete_db",

View File

@@ -28,7 +28,7 @@ mod pg_helpers_tests {
assert_eq!(
spec.cluster.settings.as_pg_settings(),
"fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
"fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
);
}

View File

@@ -150,7 +150,7 @@ impl PostgresNode {
let port: u16 = conf.parse_field("port", &context)?;
let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?;
let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?;
let uses_wal_proposer = conf.get("safekeepers").is_some();
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
// parse recovery_target_lsn, if any
let recovery_target_lsn: Option<Lsn> =
@@ -341,7 +341,7 @@ impl PostgresNode {
.map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>()
.join(",");
conf.append("safekeepers", &safekeepers);
conf.append("neon.safekeepers", &safekeepers);
} else {
// We only use setup without safekeepers for tests,
// and don't care about data durability on pageserver,

View File

@@ -92,6 +92,7 @@ The layer map tracks what layers exist in a timeline.
### Layered repository
Neon repository implementation that keeps data in layers.
### LSN
The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
@@ -125,6 +126,26 @@ TODO: use this name consistently in remote storage code. Now `disk_consistent_ls
* `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created)
TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
### Logical size
The pageserver tracks the "logical size" of a timeline. It is the
total size of all relations in all Postgres databases on the
timeline. It includes all user and system tables, including their FSM
and VM forks. But it does not include SLRUs, twophase files or any
other such data or metadata that lives outside relations.
The logical size is calculated by the pageserver, and is sent to
PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses
the logical size to enforce the size limit in the free tier. The
logical size is also shown to users in the web console.
The logical size is not affected by branches or the physical layout of
layer files in the pageserver. If you have a database with 1 GB
logical size and you create a branch of it, both branches will have 1
GB logical size, even though the branch is copy-on-write and won't
consume any extra physical disk space until you make changes to it.
### Page (block)
The basic structure used to store relation data. All pages are of the same size.

View File

@@ -112,11 +112,13 @@ Run `poetry shell` to activate the virtual environment.
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
### Obligatory checks
We force code formatting via `yapf` and type hints via `mypy`.
Run the following commands in the repository's root (next to `setup.cfg`):
We force code formatting via `black`, `isort` and type hints via `mypy`.
Run the following commands in the repository's root (next to `pyproject.toml`):
```bash
poetry run yapf -ri . # All code is reformatted
poetry run isort . # Imports are reformatted
poetry run black . # All code is reformatted
poetry run flake8 . # Python linter
poetry run mypy . # Ensure there are no typing errors
```
@@ -125,7 +127,7 @@ Otherwise it will not find its configuration.
Also consider:
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
* Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any.
* Adding more type hints to your code to avoid `Any`.
### Changing dependencies

View File

@@ -4,7 +4,6 @@ version = "0.1.0"
edition = "2021"
[dependencies]
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"
bytes = "1.0.1"

View File

@@ -1,8 +1,8 @@
//!
//! Common utilities for dealing with PostgreSQL non-relation files.
//!
use crate::transaction_id_precedes;
use super::pg_constants;
use crate::transaction_id_precedes;
use bytes::BytesMut;
use log::*;

View File

@@ -8,9 +8,9 @@
//! to look deeper into the WAL records to also understand which blocks they modify, the code
//! for that is in pageserver/src/walrecord.rs
//!
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC};
use super::pg_constants;
use super::xlog_utils::*;
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use crc32c::*;
use log::*;
@@ -170,6 +170,7 @@ impl WalStreamDecoder {
}
State::SkippingEverything { .. } => {}
}
// now read page contents
match &mut self.state {
State::WaitingForRecord => {
// need to have at least the xl_tot_len field
@@ -194,8 +195,8 @@ impl WalStreamDecoder {
return Ok(Some(self.complete_record(recordbuf)?));
} else {
// Need to assemble the record from pieces. Remember the size of the
// record, and loop back. On next iteration, we will reach the 'else'
// branch below, and copy the part of the record that was on this page
// record, and loop back. On next iterations, we will reach the branch
// below, and copy the part of the record that was on this or next page(s)
// to 'recordbuf'. Subsequent iterations will skip page headers, and
// append the continuations from the next pages to 'recordbuf'.
self.state = State::ReassemblingRecord {

View File

@@ -42,19 +42,13 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
pub trait RemoteObjectName {
// Needed to retrieve last component for RemoteObjectId.
// In other words a file name
fn object_name(&self) -> Option<&str>;
}
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files.
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync {
/// A way to uniquely reference a file in the remote storage.
type RemoteObjectId: RemoteObjectName;
type RemoteObjectId;
/// Attempts to derive the storage path out of the local path, if the latter is correct.
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
@@ -71,7 +65,7 @@ pub trait RemoteStorage: Send + Sync {
/// so this method doesnt need to.
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
prefix: Option<&Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
/// Streams the local file contents into remote into the remote storage entry.
@@ -163,6 +157,13 @@ impl GenericRemoteStorage {
}
}
}
pub fn as_local(&self) -> Option<&LocalFs> {
match self {
Self::Local(local_fs) => Some(local_fs),
_ => None,
}
}
}
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.

View File

@@ -5,7 +5,6 @@
//! volume is mounted to the local FS.
use std::{
borrow::Cow,
future::Future,
path::{Path, PathBuf},
pin::Pin,
@@ -18,16 +17,10 @@ use tokio::{
};
use tracing::*;
use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};
use crate::{path_with_suffix_extension, Download, DownloadError};
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
impl RemoteObjectName for PathBuf {
fn object_name(&self) -> Option<&str> {
self.file_stem().and_then(|n| n.to_str())
}
}
pub struct LocalFs {
working_directory: PathBuf,
storage_root: PathBuf,
@@ -113,13 +106,10 @@ impl RemoteStorage for LocalFs {
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
prefix: Option<&Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let path = match prefix {
Some(prefix) => Cow::Owned(prefix),
None => Cow::Borrowed(&self.storage_root),
};
get_all_files(path.as_ref(), false).await
let path = prefix.unwrap_or(&self.storage_root);
get_all_files(path, false).await
}
async fn upload(
@@ -150,8 +140,7 @@ impl RemoteStorage for LocalFs {
);
let from_size_bytes = from_size_bytes as u64;
// Require to read 1 byte more than the expected to check later, that the stream and its size match.
let mut buffer_to_read = from.take(from_size_bytes + 1);
let mut buffer_to_read = from.take(from_size_bytes);
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
.await
@@ -162,17 +151,15 @@ impl RemoteStorage for LocalFs {
)
})?;
if bytes_read < from_size_bytes {
bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes");
}
// Check if there is any extra data after the given size.
let mut from = buffer_to_read.into_inner();
let extra_read = from.read(&mut [1]).await?;
ensure!(
bytes_read == from_size_bytes,
"Provided stream has actual size {} fthat is smaller than the given stream size {}",
bytes_read,
from_size_bytes
);
ensure!(
buffer_to_read.read(&mut [0]).await? == 0,
"Provided stream has bigger size than the given stream size {}",
from_size_bytes
extra_read == 0,
"Provided stream was larger than expected: expected {from_size_bytes} bytes",
);
destination.flush().await.with_context(|| {
@@ -609,6 +596,34 @@ mod fs_tests {
Ok(())
}
#[tokio::test]
async fn upload_file_negatives() -> anyhow::Result<()> {
let storage = create_storage()?;
let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?;
let content = std::io::Cursor::new(b"12345");
// Check that you get an error if the size parameter doesn't match the actual
// size of the stream.
storage
.upload(content.clone(), 0, &id, None)
.await
.expect_err("upload with zero size succeeded");
storage
.upload(content.clone(), 4, &id, None)
.await
.expect_err("upload with too short size succeeded");
storage
.upload(content.clone(), 6, &id, None)
.await
.expect_err("upload with too large size succeeded");
// Correct size is 5, this should succeed.
storage.upload(content, 5, &id, None).await?;
Ok(())
}
fn create_storage() -> anyhow::Result<LocalFs> {
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
}

View File

@@ -19,9 +19,7 @@ use tokio::{io, sync::Semaphore};
use tokio_util::io::ReaderStream;
use tracing::debug;
use crate::{
strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
};
use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config};
use super::StorageMetadata;
@@ -96,6 +94,23 @@ const S3_PREFIX_SEPARATOR: char = '/';
pub struct S3ObjectKey(String);
impl S3ObjectKey {
/// Turn a/b/c or a/b/c/ into c
pub fn object_name(&self) -> Option<&str> {
// corner case, char::to_string is not const, thats why this is more verbose than it needs to be
// see https://github.com/rust-lang/rust/issues/88674
if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
return None;
}
if self.0.ends_with(S3_PREFIX_SEPARATOR) {
self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
} else {
self.0
.rsplit_once(S3_PREFIX_SEPARATOR)
.map(|(_, last)| last)
}
}
fn key(&self) -> &str {
&self.0
}
@@ -119,25 +134,6 @@ impl S3ObjectKey {
}
}
impl RemoteObjectName for S3ObjectKey {
/// Turn a/b/c or a/b/c/ into c
fn object_name(&self) -> Option<&str> {
// corner case, char::to_string is not const, thats why this is more verbose than it needs to be
// see https://github.com/rust-lang/rust/issues/88674
if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
return None;
}
if self.0.ends_with(S3_PREFIX_SEPARATOR) {
self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
} else {
self.0
.rsplit_once(S3_PREFIX_SEPARATOR)
.map(|(_, last)| last)
}
}
}
/// AWS S3 storage.
pub struct S3Bucket {
workdir: PathBuf,
@@ -316,11 +312,11 @@ impl RemoteStorage for S3Bucket {
/// Note: it wont include empty "directories"
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
prefix: Option<&Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| p.0)
.map(|p| p.0.clone())
.or_else(|| self.prefix_in_bucket.clone())
.map(|mut p| {
// required to end with a separator

View File

@@ -39,7 +39,7 @@ bytes = "1.0.1"
hex-literal = "0.3"
tempfile = "3.2"
criterion = "0.3"
rustls-pemfile = "0.2.1"
rustls-pemfile = "1"
[[bench]]
name = "benchmarks"

View File

@@ -8,6 +8,9 @@ pub mod lsn;
/// SeqWait allows waiting for a future sequence number to arrive
pub mod seqwait;
/// A simple Read-Copy-Update implementation.
pub mod simple_rcu;
/// append only ordered map implemented with a Vec
pub mod vec_map;

View File

@@ -163,14 +163,9 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
false
}
// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations)
// PG protocol strings are always C strings.
fn cstr_to_str(b: &Bytes) -> Result<&str> {
let without_null = if b.last() == Some(&0) {
&b[..b.len() - 1]
} else {
&b[..]
};
// Cast a byte slice to a string slice, dropping null terminator if there's one.
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
std::str::from_utf8(without_null).map_err(|e| e.into())
}
@@ -423,9 +418,9 @@ impl PostgresBackend {
self.state = ProtoState::Established;
}
FeMessage::Query(m) => {
FeMessage::Query(body) => {
// remove null terminator
let query_string = cstr_to_str(&m.body)?;
let query_string = cstr_to_str(&body)?;
trace!("got query {:?}", query_string);
// xxx distinguish fatal and recoverable errors?

View File

@@ -7,11 +7,14 @@ use anyhow::{bail, ensure, Context, Result};
use bytes::{Buf, BufMut, Bytes, BytesMut};
use postgres_protocol::PG_EPOCH;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::future::Future;
use std::io::{self, Cursor};
use std::str;
use std::time::{Duration, SystemTime};
use std::{
borrow::Cow,
collections::HashMap,
future::Future,
io::{self, Cursor},
str,
time::{Duration, SystemTime},
};
use tokio::io::AsyncReadExt;
use tracing::{trace, warn};
@@ -25,8 +28,10 @@ pub const TEXT_OID: Oid = 25;
#[derive(Debug)]
pub enum FeMessage {
StartupPacket(FeStartupPacket),
Query(FeQueryMessage), // Simple query
Parse(FeParseMessage), // Extended query protocol
// Simple query.
Query(Bytes),
// Extended query protocol.
Parse(FeParseMessage),
Describe(FeDescribeMessage),
Bind(FeBindMessage),
Execute(FeExecuteMessage),
@@ -51,7 +56,67 @@ pub enum FeStartupPacket {
},
}
pub type StartupMessageParams = HashMap<String, String>;
#[derive(Debug)]
pub struct StartupMessageParams {
params: HashMap<String, String>,
}
impl StartupMessageParams {
/// Get parameter's value by its name.
pub fn get(&self, name: &str) -> Option<&str> {
self.params.get(name).map(|s| s.as_str())
}
/// Split command-line options according to PostgreSQL's logic,
/// taking into account all escape sequences but leaving them as-is.
/// [`None`] means that there's no `options` in [`Self`].
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
// See `postgres: pg_split_opts`.
let mut last_was_escape = false;
let iter = self
.get("options")?
.split(move |c: char| {
// We split by non-escaped whitespace symbols.
let should_split = c.is_ascii_whitespace() && !last_was_escape;
last_was_escape = c == '\\' && !last_was_escape;
should_split
})
.filter(|s| !s.is_empty());
Some(iter)
}
/// Split command-line options according to PostgreSQL's logic,
/// applying all escape sequences (using owned strings as needed).
/// [`None`] means that there's no `options` in [`Self`].
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
// See `postgres: pg_split_opts`.
let iter = self.options_raw()?.map(|s| {
let mut preserve_next_escape = false;
let escape = |c| {
// We should remove '\\' unless it's preceded by '\\'.
let should_remove = c == '\\' && !preserve_next_escape;
preserve_next_escape = should_remove;
should_remove
};
match s.contains('\\') {
true => Cow::Owned(s.replace(escape, "")),
false => Cow::Borrowed(s),
}
});
Some(iter)
}
// This function is mostly useful in tests.
#[doc(hidden)]
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
Self {
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
}
}
}
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
pub struct CancelKeyData {
@@ -69,11 +134,6 @@ impl Distribution<CancelKeyData> for Standard {
}
}
#[derive(Debug)]
pub struct FeQueryMessage {
pub body: Bytes,
}
// We only support the simple case of Parse on unnamed prepared statement and
// no params
#[derive(Debug)]
@@ -89,7 +149,7 @@ pub struct FeDescribeMessage {
// we only support unnamed prepared stmt and portal
#[derive(Debug)]
pub struct FeBindMessage {}
pub struct FeBindMessage;
// we only support unnamed prepared stmt or portal
#[derive(Debug)]
@@ -100,7 +160,7 @@ pub struct FeExecuteMessage {
// we only support unnamed prepared stmt and portal
#[derive(Debug)]
pub struct FeCloseMessage {}
pub struct FeCloseMessage;
/// Retry a read on EINTR
///
@@ -163,22 +223,20 @@ impl FeMessage {
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
Err(e) => return Err(e.into()),
};
let len = retry_read!(stream.read_u32().await)?;
// The message length includes itself, so it better be at least 4
let bodylen = len
// The message length includes itself, so it better be at least 4.
let len = retry_read!(stream.read_u32().await)?
.checked_sub(4)
.context("invalid message length: parsing u32")?;
.context("invalid message length")?;
// Read message body
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
stream.read_exact(&mut body_buf).await?;
let body = {
let mut buffer = vec![0u8; len as usize];
stream.read_exact(&mut buffer).await?;
Bytes::from(buffer)
};
let body = Bytes::from(body_buf);
// Parse it
match tag {
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))),
b'Q' => Ok(Some(FeMessage::Query(body))),
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
@@ -242,9 +300,9 @@ impl FeStartupPacket {
stream.read_exact(params_bytes.as_mut()).await?;
// Parse params depending on request code
let most_sig_16_bits = request_code >> 16;
let least_sig_16_bits = request_code & ((1 << 16) - 1);
let message = match (most_sig_16_bits, least_sig_16_bits) {
let req_hi = request_code >> 16;
let req_lo = request_code & ((1 << 16) - 1);
let message = match (req_hi, req_lo) {
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
let mut cursor = Cursor::new(params_bytes);
@@ -253,173 +311,115 @@ impl FeStartupPacket {
cancel_key: cursor.read_i32().await?,
})
}
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => FeStartupPacket::SslRequest,
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
// Requested upgrade to SSL (aka TLS)
FeStartupPacket::SslRequest
}
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
// Requested upgrade to GSSAPI
FeStartupPacket::GssEncRequest
}
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
bail!("Unrecognized request code {}", unrecognized_code)
}
// TODO bail if protocol major_version is not 3?
(major_version, minor_version) => {
// TODO bail if protocol major_version is not 3?
// Parse null-terminated (String) pairs of param name / param value
let params_str = str::from_utf8(&params_bytes).unwrap();
let mut params_tokens = params_str.split('\0');
let mut params: HashMap<String, String> = HashMap::new();
while let Some(name) = params_tokens.next() {
let value = params_tokens
// Parse pairs of null-terminated strings (key, value).
// See `postgres: ProcessStartupPacket, build_startup_packet`.
let mut tokens = str::from_utf8(&params_bytes)
.context("StartupMessage params: invalid utf-8")?
.strip_suffix('\0') // drop packet's own null terminator
.context("StartupMessage params: missing null terminator")?
.split_terminator('\0');
let mut params = HashMap::new();
while let Some(name) = tokens.next() {
let value = tokens
.next()
.context("expected even number of params in StartupMessage")?;
if name == "options" {
// parsing options arguments "...&options=<var0>%3D<val0>+<var1>=<var1>..."
// '%3D' is '=' and '+' is ' '
.context("StartupMessage params: key without value")?;
// Note: we allow users that don't have SNI capabilities,
// to pass a special keyword argument 'project'
// to be used to determine the cluster name by the proxy.
//TODO: write unit test for this and refactor in its own function.
for cmdopt in value.split(' ') {
let nameval: Vec<&str> = cmdopt.split('=').collect();
if nameval.len() == 2 {
params.insert(nameval[0].to_string(), nameval[1].to_string());
}
}
} else {
params.insert(name.to_string(), value.to_string());
}
params.insert(name.to_owned(), value.to_owned());
}
FeStartupPacket::StartupMessage {
major_version,
minor_version,
params,
params: StartupMessageParams { params },
}
}
};
Ok(Some(FeMessage::StartupPacket(message)))
})
}
}
impl FeParseMessage {
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let _pstmt_name = read_null_terminated(&mut buf)?;
let query_string = read_null_terminated(&mut buf)?;
let nparams = buf.get_i16();
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
// FIXME: the rust-postgres driver uses a named prepared statement
// for copy_out(). We're not prepared to handle that correctly. For
// now, just ignore the statement name, assuming that the client never
// uses more than one prepared statement at a time.
/*
if !pstmt_name.is_empty() {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named prepared statements not implemented in Parse",
));
}
*/
if nparams != 0 {
bail!("query params not implemented");
}
let _pstmt_name = read_cstr(&mut buf)?;
let query_string = read_cstr(&mut buf)?;
let nparams = buf.get_i16();
ensure!(nparams == 0, "query params not implemented");
Ok(FeMessage::Parse(FeParseMessage { query_string }))
}
}
impl FeDescribeMessage {
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let kind = buf.get_u8();
let _pstmt_name = read_null_terminated(&mut buf)?;
let _pstmt_name = read_cstr(&mut buf)?;
// FIXME: see FeParseMessage::parse
/*
if !pstmt_name.is_empty() {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named prepared statements not implemented in Describe",
));
}
*/
if kind != b'S' {
bail!("only prepared statmement Describe is implemented");
}
ensure!(
kind == b'S',
"only prepared statemement Describe is implemented"
);
Ok(FeMessage::Describe(FeDescribeMessage { kind }))
}
}
impl FeExecuteMessage {
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let portal_name = read_null_terminated(&mut buf)?;
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let portal_name = read_cstr(&mut buf)?;
let maxrows = buf.get_i32();
if !portal_name.is_empty() {
bail!("named portals not implemented");
}
if maxrows != 0 {
bail!("row limit in Execute message not supported");
}
ensure!(portal_name.is_empty(), "named portals not implemented");
ensure!(maxrows == 0, "row limit in Execute message not implemented");
Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
}
}
impl FeBindMessage {
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let portal_name = read_null_terminated(&mut buf)?;
let _pstmt_name = read_null_terminated(&mut buf)?;
if !portal_name.is_empty() {
bail!("named portals not implemented");
}
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let portal_name = read_cstr(&mut buf)?;
let _pstmt_name = read_cstr(&mut buf)?;
// FIXME: see FeParseMessage::parse
/*
if !pstmt_name.is_empty() {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"named prepared statements not implemented",
));
}
*/
ensure!(portal_name.is_empty(), "named portals not implemented");
Ok(FeMessage::Bind(FeBindMessage {}))
Ok(FeMessage::Bind(FeBindMessage))
}
}
impl FeCloseMessage {
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
let _kind = buf.get_u8();
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
let _pstmt_or_portal_name = read_cstr(&mut buf)?;
// FIXME: we do nothing with Close
Ok(FeMessage::Close(FeCloseMessage {}))
Ok(FeMessage::Close(FeCloseMessage))
}
}
fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result<Bytes> {
let mut result = BytesMut::new();
loop {
if !buf.has_remaining() {
bail!("no null-terminator in string");
}
let byte = buf.get_u8();
if byte == 0 {
break;
}
result.put_u8(byte);
}
Ok(result.freeze())
}
// Backend
#[derive(Debug)]
@@ -441,7 +441,7 @@ pub enum BeMessage<'a> {
// None means column is NULL
DataRow(&'a [Option<&'a [u8]>]),
ErrorResponse(&'a str),
// single byte - used in response to SSLRequest/GSSENCRequest
/// Single byte - used in response to SSLRequest/GSSENCRequest.
EncryptionResponse(bool),
NoData,
ParameterDescription,
@@ -554,49 +554,22 @@ pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescri
formatcode: 0,
}]);
// Safe usize -> i32|i16 conversion, from rust-postgres
trait FromUsize: Sized {
fn from_usize(x: usize) -> Result<Self, io::Error>;
}
macro_rules! from_usize {
($t:ty) => {
impl FromUsize for $t {
#[inline]
fn from_usize(x: usize) -> io::Result<$t> {
if x > <$t>::max_value() as usize {
Err(io::Error::new(
io::ErrorKind::InvalidInput,
"value too large to transmit",
))
} else {
Ok(x as $t)
}
}
}
};
}
from_usize!(i32);
/// Call f() to write body of the message and prepend it with 4-byte len as
/// prescribed by the protocol.
fn write_body<F>(buf: &mut BytesMut, f: F) -> io::Result<()>
where
F: FnOnce(&mut BytesMut) -> io::Result<()>,
{
fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
let base = buf.len();
buf.extend_from_slice(&[0; 4]);
f(buf)?;
let res = f(buf);
let size = i32::from_usize(buf.len() - base)?;
let size = i32::try_from(buf.len() - base).expect("message too big to transmit");
(&mut buf[base..]).put_slice(&size.to_be_bytes());
Ok(())
res
}
/// Safe write of s into buf as cstring (String in the protocol).
pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
if s.contains(&0) {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
@@ -608,15 +581,11 @@ pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
Ok(())
}
// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations)
// PG protocol strings are always C strings.
fn cstr_to_str(b: &Bytes) -> Result<&str> {
let without_null = if b.last() == Some(&0) {
&b[..b.len() - 1]
} else {
&b[..]
};
std::str::from_utf8(without_null).map_err(|e| e.into())
fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
let pos = buf.iter().position(|x| *x == 0);
let result = buf.split_to(pos.context("missing terminator")?);
buf.advance(1); // drop the null terminator
Ok(result)
}
impl<'a> BeMessage<'a> {
@@ -631,18 +600,14 @@ impl<'a> BeMessage<'a> {
buf.put_u8(b'R');
write_body(buf, |buf| {
buf.put_i32(0); // Specifies that the authentication was successful.
Ok::<_, io::Error>(())
})
.unwrap(); // write into BytesMut can't fail
});
}
BeMessage::AuthenticationCleartextPassword => {
buf.put_u8(b'R');
write_body(buf, |buf| {
buf.put_i32(3); // Specifies that clear text password is required.
Ok::<_, io::Error>(())
})
.unwrap(); // write into BytesMut can't fail
});
}
BeMessage::AuthenticationMD5Password(salt) => {
@@ -650,9 +615,7 @@ impl<'a> BeMessage<'a> {
write_body(buf, |buf| {
buf.put_i32(5); // Specifies that an MD5-encrypted password is required.
buf.put_slice(&salt[..]);
Ok::<_, io::Error>(())
})
.unwrap(); // write into BytesMut can't fail
});
}
BeMessage::AuthenticationSasl(msg) => {
@@ -677,8 +640,7 @@ impl<'a> BeMessage<'a> {
}
}
Ok::<_, io::Error>(())
})
.unwrap()
})?;
}
BeMessage::BackendKeyData(key_data) => {
@@ -686,77 +648,64 @@ impl<'a> BeMessage<'a> {
write_body(buf, |buf| {
buf.put_i32(key_data.backend_pid);
buf.put_i32(key_data.cancel_key);
Ok(())
})
.unwrap();
});
}
BeMessage::BindComplete => {
buf.put_u8(b'2');
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
write_body(buf, |_| {});
}
BeMessage::CloseComplete => {
buf.put_u8(b'3');
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
write_body(buf, |_| {});
}
BeMessage::CommandComplete(cmd) => {
buf.put_u8(b'C');
write_body(buf, |buf| {
write_cstr(cmd, buf)?;
Ok::<_, io::Error>(())
})?;
write_body(buf, |buf| write_cstr(cmd, buf))?;
}
BeMessage::CopyData(data) => {
buf.put_u8(b'd');
write_body(buf, |buf| {
buf.put_slice(data);
Ok::<_, io::Error>(())
})
.unwrap();
});
}
BeMessage::CopyDone => {
buf.put_u8(b'c');
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
write_body(buf, |_| {});
}
BeMessage::CopyFail => {
buf.put_u8(b'f');
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
write_body(buf, |_| {});
}
BeMessage::CopyInResponse => {
buf.put_u8(b'G');
write_body(buf, |buf| {
buf.put_u8(1); /* copy_is_binary */
buf.put_i16(0); /* numAttributes */
Ok::<_, io::Error>(())
})
.unwrap();
buf.put_u8(1); // copy_is_binary
buf.put_i16(0); // numAttributes
});
}
BeMessage::CopyOutResponse => {
buf.put_u8(b'H');
write_body(buf, |buf| {
buf.put_u8(0); /* copy_is_binary */
buf.put_i16(0); /* numAttributes */
Ok::<_, io::Error>(())
})
.unwrap();
buf.put_u8(0); // copy_is_binary
buf.put_i16(0); // numAttributes
});
}
BeMessage::CopyBothResponse => {
buf.put_u8(b'W');
write_body(buf, |buf| {
// doesn't matter, used only for replication
buf.put_u8(0); /* copy_is_binary */
buf.put_i16(0); /* numAttributes */
Ok::<_, io::Error>(())
})
.unwrap();
buf.put_u8(0); // copy_is_binary
buf.put_i16(0); // numAttributes
});
}
BeMessage::DataRow(vals) => {
@@ -771,9 +720,7 @@ impl<'a> BeMessage<'a> {
buf.put_i32(-1);
}
}
Ok::<_, io::Error>(())
})
.unwrap();
});
}
// ErrorResponse is a zero-terminated array of zero-terminated fields.
@@ -788,18 +735,17 @@ impl<'a> BeMessage<'a> {
buf.put_u8(b'E');
write_body(buf, |buf| {
buf.put_u8(b'S'); // severity
write_cstr(&Bytes::from("ERROR"), buf)?;
buf.put_slice(b"ERROR\0");
buf.put_u8(b'C'); // SQLSTATE error code
write_cstr(&Bytes::from("CXX000"), buf)?;
buf.put_slice(b"CXX000\0");
buf.put_u8(b'M'); // the message
write_cstr(error_msg.as_bytes(), buf)?;
buf.put_u8(0); // terminator
Ok::<_, io::Error>(())
})
.unwrap();
})?;
}
// NoticeResponse has the same format as ErrorResponse. From doc: "The frontend should display the
@@ -812,23 +758,22 @@ impl<'a> BeMessage<'a> {
buf.put_u8(b'N');
write_body(buf, |buf| {
buf.put_u8(b'S'); // severity
write_cstr(&Bytes::from("NOTICE"), buf)?;
buf.put_slice(b"NOTICE\0");
buf.put_u8(b'C'); // SQLSTATE error code
write_cstr(&Bytes::from("CXX000"), buf)?;
buf.put_slice(b"CXX000\0");
buf.put_u8(b'M'); // the message
write_cstr(error_msg.as_bytes(), buf)?;
buf.put_u8(0); // terminator
Ok::<_, io::Error>(())
})
.unwrap();
})?;
}
BeMessage::NoData => {
buf.put_u8(b'n');
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
write_body(buf, |_| {});
}
BeMessage::EncryptionResponse(should_negotiate) => {
@@ -853,9 +798,7 @@ impl<'a> BeMessage<'a> {
buf.put_u8(b'S');
write_body(buf, |buf| {
buf.put_slice(&buffer[..cnt]);
Ok::<_, io::Error>(())
})
.unwrap();
});
}
BeMessage::ParameterDescription => {
@@ -863,23 +806,19 @@ impl<'a> BeMessage<'a> {
write_body(buf, |buf| {
// we don't support params, so always 0
buf.put_i16(0);
Ok::<_, io::Error>(())
})
.unwrap();
});
}
BeMessage::ParseComplete => {
buf.put_u8(b'1');
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
write_body(buf, |_| {});
}
BeMessage::ReadyForQuery => {
buf.put_u8(b'Z');
write_body(buf, |buf| {
buf.put_u8(b'I');
Ok::<_, io::Error>(())
})
.unwrap();
});
}
BeMessage::RowDescription(rows) => {
@@ -907,9 +846,7 @@ impl<'a> BeMessage<'a> {
buf.put_u64(body.wal_end);
buf.put_i64(body.timestamp);
buf.put_slice(body.data);
Ok::<_, io::Error>(())
})
.unwrap();
});
}
BeMessage::KeepAlive(req) => {
@@ -918,10 +855,8 @@ impl<'a> BeMessage<'a> {
buf.put_u8(b'k');
buf.put_u64(req.sent_ptr);
buf.put_i64(req.timestamp);
buf.put_u8(if req.request_reply { 1u8 } else { 0u8 });
Ok::<_, io::Error>(())
})
.unwrap();
buf.put_u8(if req.request_reply { 1 } else { 0 });
});
}
}
Ok(())
@@ -968,17 +903,17 @@ impl ReplicationFeedback {
// value itself
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
write_cstr(&Bytes::from("current_timeline_size"), buf)?;
buf.put_slice(b"current_timeline_size\0");
buf.put_i32(8);
buf.put_u64(self.current_timeline_size);
write_cstr(&Bytes::from("ps_writelsn"), buf)?;
buf.put_slice(b"ps_writelsn\0");
buf.put_i32(8);
buf.put_u64(self.ps_writelsn);
write_cstr(&Bytes::from("ps_flushlsn"), buf)?;
buf.put_slice(b"ps_flushlsn\0");
buf.put_i32(8);
buf.put_u64(self.ps_flushlsn);
write_cstr(&Bytes::from("ps_applylsn"), buf)?;
buf.put_slice(b"ps_applylsn\0");
buf.put_i32(8);
buf.put_u64(self.ps_applylsn);
@@ -988,7 +923,7 @@ impl ReplicationFeedback {
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
.as_micros() as i64;
write_cstr(&Bytes::from("ps_replytime"), buf)?;
buf.put_slice(b"ps_replytime\0");
buf.put_i32(8);
buf.put_i64(timestamp);
Ok(())
@@ -998,33 +933,30 @@ impl ReplicationFeedback {
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
let mut zf = ReplicationFeedback::empty();
let nfields = buf.get_u8();
let mut i = 0;
while i < nfields {
i += 1;
let key_cstr = read_null_terminated(&mut buf).unwrap();
let key = cstr_to_str(&key_cstr).unwrap();
match key {
"current_timeline_size" => {
for _ in 0..nfields {
let key = read_cstr(&mut buf).unwrap();
match key.as_ref() {
b"current_timeline_size" => {
let len = buf.get_i32();
assert_eq!(len, 8);
zf.current_timeline_size = buf.get_u64();
}
"ps_writelsn" => {
b"ps_writelsn" => {
let len = buf.get_i32();
assert_eq!(len, 8);
zf.ps_writelsn = buf.get_u64();
}
"ps_flushlsn" => {
b"ps_flushlsn" => {
let len = buf.get_i32();
assert_eq!(len, 8);
zf.ps_flushlsn = buf.get_u64();
}
"ps_applylsn" => {
b"ps_applylsn" => {
let len = buf.get_i32();
assert_eq!(len, 8);
zf.ps_applylsn = buf.get_u64();
}
"ps_replytime" => {
b"ps_replytime" => {
let len = buf.get_i32();
assert_eq!(len, 8);
let raw_time = buf.get_i64();
@@ -1037,8 +969,8 @@ impl ReplicationFeedback {
_ => {
let len = buf.get_i32();
warn!(
"ReplicationFeedback parse. unknown key {} of len {}. Skip it.",
key, len
"ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
String::from_utf8_lossy(key.as_ref())
);
buf.advance(len as usize);
}
@@ -1084,7 +1016,7 @@ mod tests {
*first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
}
write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
data.put_slice(b"new_field_one\0");
data.put_i32(8);
data.put_u64(42);
@@ -1093,6 +1025,33 @@ mod tests {
assert_eq!(zf, zf_parsed);
}
#[test]
fn test_startup_message_params_options_escaped() {
fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
params
.options_escaped()
.expect("options are None")
.collect()
}
let make_params = |options| StartupMessageParams::new([("options", options)]);
let params = StartupMessageParams::new([]);
assert!(matches!(params.options_escaped(), None));
let params = make_params("");
assert!(split_options(&params).is_empty());
let params = make_params("foo");
assert_eq!(split_options(&params), ["foo"]);
let params = make_params(" foo bar ");
assert_eq!(split_options(&params), ["foo", "bar"]);
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
}
// Make sure that `read` is sync/async callable
async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) {
let _ = FeMessage::read(&mut [].as_ref());

View File

@@ -1,8 +1,8 @@
///
/// Async version of 'seqwait.rs'
///
/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
///
//!
//! Async version of 'seqwait.rs'
//!
//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
//!
#![warn(missing_docs)]

View File

@@ -0,0 +1,217 @@
//!
//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
//! similar to a lock, but it allows readers to "hold on" to an old value of RCU
//! without blocking writers, and allows writing a new values without blocking
//! readers. When you update the new value, the new value is immediately visible
//! to new readers, but the update waits until all existing readers have
//! finishe, so that no one sees the old value anymore.
//!
//! This implementation isn't wait-free; it uses an RwLock that is held for a
//! short duration when the value is read or updated.
//!
#![warn(missing_docs)]
use std::ops::Deref;
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
use std::sync::{Arc, Weak};
use std::sync::{Mutex, RwLock, RwLockWriteGuard};
///
/// Rcu allows multiple readers to read and hold onto a value without blocking
/// (for very long). Storing to the Rcu updates the value, making new readers
/// immediately see the new value, but it also waits for all current readers to
/// finish.
///
pub struct Rcu<V> {
inner: RwLock<RcuInner<V>>,
}
struct RcuInner<V> {
current_cell: Arc<RcuCell<V>>,
old_cells: Vec<Weak<RcuCell<V>>>,
}
///
/// RcuCell holds one value. It can be the latest one, or an old one.
///
struct RcuCell<V> {
value: V,
/// A dummy channel. We never send anything to this channel. The point is
/// that when the RcuCell is dropped, any cloned Senders will be notified
/// that the channel is closed. Updaters can use this to wait out until the
/// RcuCell has been dropped, i.e. until the old value is no longer in use.
///
/// We never do anything with the receiver, we just need to hold onto it so
/// that the Senders will be notified when it's dropped. But because it's
/// not Sync, we need a Mutex on it.
watch: (SyncSender<()>, Mutex<Receiver<()>>),
}
impl<V> RcuCell<V> {
fn new(value: V) -> Self {
let (watch_sender, watch_receiver) = sync_channel(0);
RcuCell {
value,
watch: (watch_sender, Mutex::new(watch_receiver)),
}
}
}
impl<V> Rcu<V> {
/// Create a new `Rcu`, initialized to `starting_val`
pub fn new(starting_val: V) -> Self {
let inner = RcuInner {
current_cell: Arc::new(RcuCell::new(starting_val)),
old_cells: Vec::new(),
};
Self {
inner: RwLock::new(inner),
}
}
///
/// Read current value. Any store() calls will block until the returned
/// guard object is dropped.
///
pub fn read(&self) -> RcuReadGuard<V> {
let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell);
RcuReadGuard { cell: current_cell }
}
///
/// Lock the current value for updating. Returns a guard object that can be
/// used to read the current value, and to store a new value.
///
/// Note: holding the write-guard blocks concurrent readers, so you should
/// finish the update and drop the guard quickly!
///
pub fn write(&self) -> RcuWriteGuard<'_, V> {
let inner = self.inner.write().unwrap();
RcuWriteGuard { inner }
}
}
///
/// Read guard returned by `read`
///
pub struct RcuReadGuard<V> {
cell: Arc<RcuCell<V>>,
}
impl<V> Deref for RcuReadGuard<V> {
type Target = V;
fn deref(&self) -> &V {
&self.cell.value
}
}
///
/// Read guard returned by `read`
///
pub struct RcuWriteGuard<'a, V> {
inner: RwLockWriteGuard<'a, RcuInner<V>>,
}
impl<'a, V> Deref for RcuWriteGuard<'a, V> {
type Target = V;
fn deref(&self) -> &V {
&self.inner.current_cell.value
}
}
impl<'a, V> RcuWriteGuard<'a, V> {
///
/// Store a new value. The new value will be written to the Rcu immediately,
/// and will be immediately seen by any `read` calls that start afterwards.
/// But if there are any readers still holding onto the old value, or any
/// even older values, this will await until they have been released.
///
/// This will drop the write-guard before it starts waiting for the reads to
/// finish, so a new write operation can begin before this functio returns.
///
pub fn store(mut self, new_val: V) {
let new_cell = Arc::new(RcuCell::new(new_val));
let mut watches = Vec::new();
{
let old = std::mem::replace(&mut self.inner.current_cell, new_cell);
self.inner.old_cells.push(Arc::downgrade(&old));
// cleanup old cells that no longer have any readers, and collect
// the watches for any that do.
self.inner.old_cells.retain(|weak| {
if let Some(cell) = weak.upgrade() {
watches.push(cell.watch.0.clone());
true
} else {
false
}
});
}
drop(self);
// after all the old_cells are no longer in use, we're done
for w in watches.iter_mut() {
// This will block until the Receiver is closed. That happens then
// the RcuCell is dropped.
#[allow(clippy::single_match)]
match w.send(()) {
Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
Err(_) => {
// closed, which means that the cell has been dropped, and
// its value is no longer in use
}
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::{Arc, Mutex};
use std::thread::{sleep, spawn};
use std::time::Duration;
#[test]
fn basic() {
let rcu = Arc::new(Rcu::new(1));
let log = Arc::new(Mutex::new(Vec::new()));
let a = rcu.read();
assert_eq!(*a, 1);
log.lock().unwrap().push("one");
let (rcu_clone, log_clone) = (Arc::clone(&rcu), Arc::clone(&log));
let thread = spawn(move || {
log_clone.lock().unwrap().push("store two start");
let write_guard = rcu_clone.write();
assert_eq!(*write_guard, 1);
write_guard.store(2);
log_clone.lock().unwrap().push("store two done");
});
// without this sleep the test can pass on accident if the writer is slow
sleep(Duration::from_secs(1));
// new read should see the new value
let b = rcu.read();
assert_eq!(*b, 2);
// old guard still sees the old value
assert_eq!(*a, 1);
// Release the old guard. This lets the store in the thread to finish.
log.lock().unwrap().push("release a");
drop(a);
thread.join().unwrap();
assert_eq!(
log.lock().unwrap().as_slice(),
&["one", "store two start", "release a", "store two done",]
);
}
}

View File

@@ -15,7 +15,7 @@ failpoints = ["fail/failpoints"]
chrono = "0.4.19"
rand = "0.8.3"
regex = "1.4.5"
bytes = { version = "1.0.1", features = ['serde'] }
bytes = "1.0.1"
byteorder = "1.4.3"
futures = "0.3.13"
hex = "0.4.3"

View File

@@ -22,8 +22,8 @@ use std::time::SystemTime;
use tar::{Builder, EntryType, Header};
use tracing::*;
use crate::layered_repository::Timeline;
use crate::reltag::{RelTag, SlruKind};
use crate::DatadirTimeline;
use postgres_ffi::v14::pg_constants;
use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName};
@@ -36,13 +36,12 @@ use utils::lsn::Lsn;
/// This is short-living object only for the time of tarball creation,
/// created mostly to avoid passing a lot of parameters between various functions
/// used for constructing tarball.
pub struct Basebackup<'a, W, T>
pub struct Basebackup<'a, W>
where
W: Write,
T: DatadirTimeline,
{
ar: Builder<AbortableWrite<W>>,
timeline: &'a Arc<T>,
timeline: &'a Arc<Timeline>,
pub lsn: Lsn,
prev_record_lsn: Lsn,
full_backup: bool,
@@ -57,18 +56,17 @@ where
// * When working without safekeepers. In this situation it is important to match the lsn
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
// to start the replication.
impl<'a, W, T> Basebackup<'a, W, T>
impl<'a, W> Basebackup<'a, W>
where
W: Write,
T: DatadirTimeline,
{
pub fn new(
write: W,
timeline: &'a Arc<T>,
timeline: &'a Arc<Timeline>,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> Result<Basebackup<'a, W, T>> {
) -> Result<Basebackup<'a, W>> {
// Compute postgres doesn't have any previous WAL files, but the first
// record that it's going to write needs to include the LSN of the
// previous record (xl_prev). We include prev_record_lsn in the
@@ -404,10 +402,9 @@ where
}
}
impl<'a, W, T> Drop for Basebackup<'a, W, T>
impl<'a, W> Drop for Basebackup<'a, W>
where
W: Write,
T: DatadirTimeline,
{
/// If the basebackup was not finished, prevent the Archive::drop() from
/// writing the end-of-archive marker.

View File

@@ -1,6 +1,7 @@
//! Main entry point for the Page Server executable.
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
use remote_storage::GenericRemoteStorage;
use std::{env, ops::ControlFlow, path::Path, str::FromStr, sync::Arc};
use tracing::*;
use anyhow::{bail, Context, Result};
@@ -298,7 +299,14 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
};
info!("Using auth: {:#?}", conf.auth_type);
let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
let remote_storage = conf
.remote_storage_config
.as_ref()
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
.transpose()
.context("Failed to init generic remote storage")?
.map(Arc::new);
let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.as_ref().map(Arc::clone))?;
// Spawn a new thread for the http endpoint
// bind before launching separate thread so the error reported before startup exits
@@ -310,7 +318,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
"http_endpoint_thread",
true,
move || {
let router = http::make_router(conf, auth_cloned, remote_index)?;
let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?;
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
},
)?;

View File

@@ -129,9 +129,9 @@ pub struct LocalTimelineInfo {
pub latest_gc_cutoff_lsn: Lsn,
#[serde_as(as = "DisplayFromStr")]
pub disk_consistent_lsn: Lsn,
pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<usize>,
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
pub current_logical_size_non_incremental: Option<u64>,
pub current_physical_size_non_incremental: Option<u64>,
pub timeline_state: LocalTimelineState,
@@ -150,6 +150,9 @@ pub struct RemoteTimelineInfo {
pub awaits_download: bool,
}
///
/// This represents the output of the "timeline_detail" API call.
///
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {

View File

@@ -11,10 +11,8 @@ use super::models::{
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
TimelineCreateRequest,
};
use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline};
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::layered_repository::{metadata::TimelineMetadata, Timeline};
use crate::repository::{LocalTimelineState, RepositoryTimeline};
use crate::repository::{Repository, Timeline};
use crate::storage_sync;
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
use crate::tenant_config::TenantConfOpt;
@@ -37,7 +35,7 @@ struct State {
auth: Option<Arc<JwtAuth>>,
remote_index: RemoteIndex,
allowlist_routes: Vec<Uri>,
remote_storage: Option<GenericRemoteStorage>,
remote_storage: Option<Arc<GenericRemoteStorage>>,
}
impl State {
@@ -45,20 +43,12 @@ impl State {
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_index: RemoteIndex,
remote_storage: Option<Arc<GenericRemoteStorage>>,
) -> anyhow::Result<Self> {
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
.iter()
.map(|v| v.parse().unwrap())
.collect::<Vec<_>>();
// Note that this remote storage is created separately from the main one in the sync_loop.
// It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
let remote_storage = conf
.remote_storage_config
.as_ref()
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
.transpose()
.context("Failed to init generic remote storage")?;
Ok(Self {
conf,
auth,
@@ -85,7 +75,7 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
// Helper functions to construct a LocalTimelineInfo struct for a timeline
fn local_timeline_info_from_loaded_timeline(
timeline: &LayeredTimeline,
timeline: &Timeline,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<LocalTimelineInfo> {
@@ -160,7 +150,7 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo
}
fn local_timeline_info_from_repo_timeline(
repo_timeline: &RepositoryTimeline<LayeredTimeline>,
repo_timeline: &RepositoryTimeline<Timeline>,
include_non_incremental_logical_size: bool,
include_non_incremental_physical_size: bool,
) -> anyhow::Result<LocalTimelineInfo> {
@@ -208,7 +198,6 @@ async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiErr
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_id))?;
let new_timeline_info = tokio::task::spawn_blocking(move || {
@@ -246,11 +235,12 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let local_timeline_infos = tokio::task::spawn_blocking(move || {
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
list_local_timelines(
@@ -301,13 +291,12 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
let include_non_incremental_logical_size =
query_param_present(&request, "include-non-incremental-logical-size");
let include_non_incremental_physical_size =
query_param_present(&request, "include-non-incremental-physical-size");
check_permission(&request, Some(tenant_id))?;
let (local_timeline_info, remote_timeline_info) = async {
// any error here will render local timeline as None
@@ -371,7 +360,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
info!("Handling tenant attach {}", tenant_id,);
info!("Handling tenant attach {}", tenant_id);
tokio::task::spawn_blocking(move || {
if tenant_mgr::get_tenant_state(tenant_id).is_some() {
@@ -451,16 +440,8 @@ async fn gather_tenant_timelines_index_parts(
tenant_id: ZTenantId,
) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
let index_parts = match state.remote_storage.as_ref() {
Some(GenericRemoteStorage::Local(local_storage)) => {
storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id)
.await
}
// FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones
// because it is a different instance. We can move this limit to some global static
// or use one instance everywhere.
Some(GenericRemoteStorage::S3(s3_storage)) => {
storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id)
.await
Some(storage) => {
storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await
}
None => return Ok(None),
}
@@ -480,9 +461,8 @@ async fn gather_tenant_timelines_index_parts(
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
tokio::task::spawn_blocking(move || {
@@ -521,7 +501,6 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
}
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
// check for management permission
check_permission(&request, None)?;
let state = get_state(&request);
@@ -589,7 +568,6 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
}
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
// check for management permission
check_permission(&request, None)?;
let request_data: TenantCreateRequest = json_request(&mut request).await?;
@@ -658,7 +636,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
let request_data: TenantConfigRequest = json_request(&mut request).await?;
let tenant_id = request_data.tenant_id;
// check for management permission
check_permission(&request, Some(tenant_id))?;
let mut tenant_conf: TenantConfOpt = Default::default();
@@ -721,6 +698,7 @@ pub fn make_router(
conf: &'static PageServerConf,
auth: Option<Arc<JwtAuth>>,
remote_index: RemoteIndex,
remote_storage: Option<Arc<GenericRemoteStorage>>,
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
let spec = include_bytes!("openapi_spec.yml");
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -737,7 +715,8 @@ pub fn make_router(
Ok(router
.data(Arc::new(
State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
State::new(conf, auth, remote_index, remote_storage)
.context("Failed to initialize router state")?,
))
.get("/v1/status", status_handler)
.get("/v1/tenant", tenant_list_handler)

View File

@@ -11,6 +11,7 @@ use bytes::Bytes;
use tracing::*;
use walkdir::WalkDir;
use crate::layered_repository::Timeline;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::walingest::WalIngest;
@@ -39,9 +40,9 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
/// This is currently only used to import a cluster freshly created by initdb.
/// The code that deals with the checkpoint would not work right if the
/// cluster was not shut down cleanly.
pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
pub fn import_timeline_from_postgres_datadir(
path: &Path,
tline: &T,
tline: &Timeline,
lsn: Lsn,
) -> Result<()> {
let mut pg_control: Option<ControlFileData> = None;
@@ -99,8 +100,8 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
}
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
fn import_rel<T: DatadirTimeline, Reader: Read>(
modification: &mut DatadirModification<T>,
fn import_rel<Reader: Read>(
modification: &mut DatadirModification,
path: &Path,
spcoid: Oid,
dboid: Oid,
@@ -178,8 +179,8 @@ fn import_rel<T: DatadirTimeline, Reader: Read>(
/// Import an SLRU segment file
///
fn import_slru<T: DatadirTimeline, Reader: Read>(
modification: &mut DatadirModification<T>,
fn import_slru<Reader: Read>(
modification: &mut DatadirModification,
slru: SlruKind,
path: &Path,
mut reader: Reader,
@@ -234,12 +235,7 @@ fn import_slru<T: DatadirTimeline, Reader: Read>(
/// Scan PostgreSQL WAL files in given directory and load all records between
/// 'startpoint' and 'endpoint' into the repository.
fn import_wal<T: DatadirTimeline>(
walpath: &Path,
tline: &T,
startpoint: Lsn,
endpoint: Lsn,
) -> Result<()> {
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
let mut waldecoder = WalStreamDecoder::new(startpoint);
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
@@ -305,12 +301,12 @@ fn import_wal<T: DatadirTimeline>(
Ok(())
}
pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
tline: &T,
pub fn import_basebackup_from_tar<Reader: Read>(
tline: &Timeline,
reader: Reader,
base_lsn: Lsn,
) -> Result<()> {
info!("importing base at {}", base_lsn);
info!("importing base at {base_lsn}");
let mut modification = tline.begin_modification(base_lsn);
modification.init_empty()?;
@@ -335,7 +331,11 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
debug!("directory {:?}", file_path);
}
_ => {
panic!("tar::EntryType::?? {}", file_path.display());
bail!(
"entry {} in backup tar archive is of unexpected type: {:?}",
file_path.display(),
header.entry_type()
);
}
}
}
@@ -347,8 +347,8 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
Ok(())
}
pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
tline: &T,
pub fn import_wal_from_tar<Reader: Read>(
tline: &Timeline,
reader: Reader,
start_lsn: Lsn,
end_lsn: Lsn,
@@ -388,7 +388,11 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
continue;
}
_ => {
panic!("tar::EntryType::?? {}", file_path.display());
bail!(
"entry {} in WAL tar archive is of unexpected type: {:?}",
file_path.display(),
header.entry_type()
);
}
}
};
@@ -428,14 +432,12 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
Ok(())
}
pub fn import_file<T: DatadirTimeline, Reader: Read>(
modification: &mut DatadirModification<T>,
fn import_file<Reader: Read>(
modification: &mut DatadirModification,
file_path: &Path,
reader: Reader,
len: usize,
) -> Result<Option<ControlFileData>> {
debug!("looking at {:?}", file_path);
if file_path.starts_with("global") {
let spcnode = pg_constants::GLOBALTABLESPACE_OID;
let dbnode = 0;
@@ -557,7 +559,10 @@ pub fn import_file<T: DatadirTimeline, Reader: Read>(
// this to import arbitrary postgres databases.
bail!("Importing pg_tblspc is not implemented");
} else {
debug!("ignored");
debug!(
"ignoring unrecognized file \"{}\" in tar archive",
file_path.display()
);
}
Ok(None)

View File

@@ -13,6 +13,7 @@
use anyhow::{bail, ensure, Context, Result};
use tracing::*;
use utils::zid::ZTenantTimelineId;
use std::cmp::min;
use std::collections::hash_map::Entry;
@@ -31,7 +32,8 @@ use crate::config::PageServerConf;
use crate::storage_sync::index::RemoteIndex;
use crate::tenant_config::{TenantConf, TenantConfOpt};
use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline};
use crate::repository::{GcResult, RepositoryTimeline};
use crate::tenant_mgr::LocalTimelineUpdate;
use crate::thread_mgr;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
@@ -61,13 +63,13 @@ mod timeline;
use storage_layer::Layer;
use timeline::LayeredTimelineEntry;
pub use timeline::LayeredTimeline;
pub use timeline::Timeline;
// re-export this function so that page_cache.rs can use it.
pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file;
// re-export for use in storage_sync.rs
pub use crate::layered_repository::timeline::save_metadata;
pub use crate::layered_repository::metadata::save_metadata;
// re-export for use in walreceiver
pub use crate::layered_repository::timeline::WalReceiverInfo;
@@ -78,7 +80,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
///
/// Repository consists of multiple timelines. Keep them in a hash table.
///
pub struct LayeredRepository {
pub struct Repository {
// Global pageserver config parameters
pub conf: &'static PageServerConf,
@@ -119,17 +121,22 @@ pub struct LayeredRepository {
upload_layers: bool,
}
/// Public interface
impl Repository for LayeredRepository {
type Timeline = LayeredTimeline;
fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Self::Timeline>> {
let timelines = self.timelines.lock().unwrap();
self.get_timeline_internal(timelineid, &timelines)
/// A repository corresponds to one .neon directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
impl Repository {
/// Get Timeline handle for given zenith timeline ID.
/// This function is idempotent. It doesn't change internal state in any way.
pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Timeline>> {
self.timelines
.lock()
.unwrap()
.get(&timelineid)
.cloned()
.map(RepositoryTimeline::from)
}
fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<LayeredTimeline>> {
/// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
pub fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<Timeline>> {
let mut timelines = self.timelines.lock().unwrap();
match self.get_timeline_load_internal(timelineid, &mut timelines)? {
Some(local_loaded_timeline) => Ok(local_loaded_timeline),
@@ -140,7 +147,9 @@ impl Repository for LayeredRepository {
}
}
fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Self::Timeline>)> {
/// Lists timelines the repository contains.
/// Up to repository's implementation to omit certain timelines that ar not considered ready for use.
pub fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Timeline>)> {
self.timelines
.lock()
.unwrap()
@@ -154,11 +163,13 @@ impl Repository for LayeredRepository {
.collect()
}
fn create_empty_timeline(
/// Create a new, empty timeline. The caller is responsible for loading data into it
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
pub fn create_empty_timeline(
&self,
timeline_id: ZTimelineId,
initdb_lsn: Lsn,
) -> Result<Arc<LayeredTimeline>> {
) -> Result<Arc<Timeline>> {
let mut timelines = self.timelines.lock().unwrap();
let vacant_timeline_entry = match timelines.entry(timeline_id) {
Entry::Occupied(_) => bail!("Timeline already exists"),
@@ -174,9 +185,9 @@ impl Repository for LayeredRepository {
crashsafe_dir::create_dir_all(timeline_path)?;
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;
save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;
let timeline = LayeredTimeline::new(
let timeline = Timeline::new(
self.conf,
Arc::clone(&self.tenant_conf),
metadata,
@@ -192,11 +203,16 @@ impl Repository for LayeredRepository {
let timeline = Arc::new(timeline);
vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)));
crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach {
id: ZTenantTimelineId::new(self.tenant_id(), timeline_id),
timeline: Arc::clone(&timeline),
});
Ok(timeline)
}
/// Branch a timeline
fn branch_timeline(
pub fn branch_timeline(
&self,
src: ZTimelineId,
dst: ZTimelineId,
@@ -238,7 +254,8 @@ impl Repository for LayeredRepository {
src_timeline
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
.context(format!(
"invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}"
"invalid branch start lsn: less than latest GC cutoff {}",
*latest_gc_cutoff_lsn
))?;
{
let gc_info = src_timeline.gc_info.read().unwrap();
@@ -274,11 +291,11 @@ impl Repository for LayeredRepository {
dst_prev,
Some(src),
start_lsn,
*src_timeline.latest_gc_cutoff_lsn.read().unwrap(),
*src_timeline.latest_gc_cutoff_lsn.read(),
src_timeline.initdb_lsn,
);
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?;
timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata });
info!("branched timeline {} from {} at {}", dst, src, start_lsn);
@@ -286,10 +303,16 @@ impl Repository for LayeredRepository {
Ok(())
}
/// Public entry point to GC. All the logic is in the private
/// gc_iteration_internal function, this public facade just wraps it for
/// metrics collection.
fn gc_iteration(
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by gc thread.
/// also it can be explicitly requested through page server api 'do_gc' command.
///
/// 'timelineid' specifies the timeline to GC, or None for all.
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
/// to make tests more deterministic.
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
pub fn gc_iteration(
&self,
target_timeline_id: Option<ZTimelineId>,
horizon: u64,
@@ -307,7 +330,11 @@ impl Repository for LayeredRepository {
})
}
fn compaction_iteration(&self) -> Result<()> {
/// Perform one compaction iteration.
/// This function is periodically called by compactor thread.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
pub fn compaction_iteration(&self) -> Result<()> {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// compactions. We don't want to block everything else while the
@@ -335,12 +362,11 @@ impl Repository for LayeredRepository {
Ok(())
}
///
/// Flush all in-memory data to disk.
///
/// Used at shutdown.
/// Used at graceful shutdown.
///
fn checkpoint(&self) -> Result<()> {
pub fn checkpoint(&self) -> Result<()> {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
// checkpoints. We don't want to block everything else while the
@@ -370,7 +396,8 @@ impl Repository for LayeredRepository {
Ok(())
}
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
/// Removes timeline-related in-memory data
pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
// in order to be retriable detach needs to be idempotent
// (or at least to a point that each time the detach is called it can make progress)
let mut timelines = self.timelines.lock().unwrap();
@@ -407,7 +434,9 @@ impl Repository for LayeredRepository {
Ok(())
}
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
/// See [`crate::remote_storage`] for more details about the synchronization.
pub fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
debug!("attach timeline_id: {}", timeline_id,);
match self.timelines.lock().unwrap().entry(timeline_id) {
Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
@@ -421,13 +450,14 @@ impl Repository for LayeredRepository {
Ok(())
}
fn get_remote_index(&self) -> &RemoteIndex {
/// Allows to retrieve remote timeline index from the tenant. Used in walreceiver to grab remote consistent lsn.
pub fn get_remote_index(&self) -> &RemoteIndex {
&self.remote_index
}
}
/// Private functions
impl LayeredRepository {
impl Repository {
pub fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
@@ -517,49 +547,37 @@ impl LayeredRepository {
tenant_conf.update(&new_tenant_conf);
LayeredRepository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?;
Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?;
Ok(())
}
// Implementation of the public `get_timeline` function.
// Differences from the public:
// * interface in that the caller must already hold the mutex on the 'timelines' hashmap.
fn get_timeline_internal(
&self,
timelineid: ZTimelineId,
timelines: &HashMap<ZTimelineId, LayeredTimelineEntry>,
) -> Option<LayeredTimelineEntry> {
timelines.get(&timelineid).cloned()
}
// Implementation of the public `get_timeline_load` function.
// Differences from the public:
// * interface in that the caller must already hold the mutex on the 'timelines' hashmap.
fn get_timeline_load_internal(
&self,
timelineid: ZTimelineId,
timeline_id: ZTimelineId,
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
) -> anyhow::Result<Option<Arc<LayeredTimeline>>> {
match timelines.get(&timelineid) {
) -> anyhow::Result<Option<Arc<Timeline>>> {
match timelines.get(&timeline_id) {
Some(entry) => match entry {
LayeredTimelineEntry::Loaded(local_timeline) => {
debug!("timeline {} found loaded into memory", &timelineid);
debug!("timeline {timeline_id} found loaded into memory");
return Ok(Some(Arc::clone(local_timeline)));
}
LayeredTimelineEntry::Unloaded { .. } => {}
},
None => {
debug!("timeline {} not found", &timelineid);
debug!("timeline {timeline_id} not found");
return Ok(None);
}
};
debug!(
"timeline {} found on a local disk, but not loaded into the memory, loading",
&timelineid
"timeline {timeline_id} found on a local disk, but not loaded into the memory, loading"
);
let timeline = self.load_local_timeline(timelineid, timelines)?;
let timeline = self.load_local_timeline(timeline_id, timelines)?;
let was_loaded = timelines.insert(
timelineid,
timeline_id,
LayeredTimelineEntry::Loaded(Arc::clone(&timeline)),
);
ensure!(
@@ -574,7 +592,7 @@ impl LayeredRepository {
&self,
timeline_id: ZTimelineId,
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
) -> anyhow::Result<Arc<LayeredTimeline>> {
) -> anyhow::Result<Arc<Timeline>> {
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
.context("failed to load metadata")?;
let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -591,7 +609,7 @@ impl LayeredRepository {
.map(LayeredTimelineEntry::Loaded);
let _enter = info_span!("loading local timeline").entered();
let timeline = LayeredTimeline::new(
let timeline = Timeline::new(
self.conf,
Arc::clone(&self.tenant_conf),
metadata,
@@ -605,7 +623,14 @@ impl LayeredRepository {
.load_layer_map(disk_consistent_lsn)
.context("failed to load layermap")?;
Ok(Arc::new(timeline))
let timeline = Arc::new(timeline);
crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach {
id: ZTenantTimelineId::new(self.tenant_id(), timeline_id),
timeline: Arc::clone(&timeline),
});
Ok(timeline)
}
pub fn new(
@@ -615,8 +640,8 @@ impl LayeredRepository {
tenant_id: ZTenantId,
remote_index: RemoteIndex,
upload_layers: bool,
) -> LayeredRepository {
LayeredRepository {
) -> Repository {
Repository {
tenant_id,
file_lock: RwLock::new(()),
conf,
@@ -632,9 +657,9 @@ impl LayeredRepository {
/// Locate and load config
pub fn load_tenant_config(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tenant_id: ZTenantId,
) -> anyhow::Result<TenantConfOpt> {
let target_config_path = TenantConf::path(conf, tenantid);
let target_config_path = TenantConf::path(conf, tenant_id);
info!("load tenantconf from {}", target_config_path.display());
@@ -669,11 +694,11 @@ impl LayeredRepository {
pub fn persist_tenant_config(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tenant_id: ZTenantId,
tenant_conf: TenantConfOpt,
) -> anyhow::Result<()> {
let _enter = info_span!("saving tenantconf").entered();
let target_config_path = TenantConf::path(conf, tenantid);
let target_config_path = TenantConf::path(conf, tenant_id);
info!("save tenantconf to {}", target_config_path.display());
let mut conf_content = r#"# This file contains a specific per-tenant's config.
@@ -810,7 +835,7 @@ impl LayeredRepository {
// compaction (both require `layer_removal_cs` lock),
// but the GC iteration can run concurrently with branch creation.
//
// See comments in [`LayeredRepository::branch_timeline`] for more information
// See comments in [`Repository::branch_timeline`] for more information
// about why branch creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if thread_mgr::is_shutdown_requested() {
@@ -886,22 +911,525 @@ pub fn load_metadata(
})
}
///
/// Tests that are specific to the layered storage format.
///
/// There are more unit tests in repository.rs that work through the
/// Repository interface and are expected to work regardless of the
/// file format and directory layout. The test here are more low level.
///
#[cfg(test)]
pub mod tests {
pub mod repo_harness {
use bytes::{Bytes, BytesMut};
use once_cell::sync::Lazy;
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::{fs, path::PathBuf};
use utils::lsn::Lsn;
use crate::storage_sync::index::RemoteIndex;
use crate::{
config::PageServerConf,
layered_repository::Repository,
repository::Key,
walrecord::ZenithWalRecord,
walredo::{WalRedoError, WalRedoManager},
};
use super::*;
use crate::tenant_config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::zid::{ZTenantId, ZTimelineId};
pub const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
pub fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(64, 0);
buf.freeze()
}
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
Self {
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
gc_horizon: Some(tenant_conf.gc_horizon),
gc_period: Some(tenant_conf.gc_period),
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
pitr_interval: Some(tenant_conf.pitr_interval),
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
}
}
}
pub struct RepoHarness<'a> {
pub conf: &'static PageServerConf,
pub tenant_conf: TenantConf,
pub tenant_id: ZTenantId,
pub lock_guard: (
Option<RwLockReadGuard<'a, ()>>,
Option<RwLockWriteGuard<'a, ()>>,
),
}
impl<'a> RepoHarness<'a> {
pub fn create(test_name: &'static str) -> Result<Self> {
Self::create_internal(test_name, false)
}
pub fn create_exclusive(test_name: &'static str) -> Result<Self> {
Self::create_internal(test_name, true)
}
fn create_internal(test_name: &'static str, exclusive: bool) -> Result<Self> {
let lock_guard = if exclusive {
(None, Some(LOCK.write().unwrap()))
} else {
(Some(LOCK.read().unwrap()), None)
};
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_conf = TenantConf::dummy_conf();
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
Ok(Self {
conf,
tenant_conf,
tenant_id,
lock_guard,
})
}
pub fn load(&self) -> Repository {
self.try_load().expect("failed to load test repo")
}
pub fn try_load(&self) -> Result<Repository> {
let walredo_mgr = Arc::new(TestRedoManager);
let repo = Repository::new(
self.conf,
TenantConfOpt::from(self.tenant_conf),
walredo_mgr,
self.tenant_id,
RemoteIndex::default(),
false,
);
// populate repo with locally available timelines
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
.expect("should be able to read timelines dir")
{
let timeline_dir_entry = timeline_dir_entry.unwrap();
let timeline_id: ZTimelineId = timeline_dir_entry
.path()
.file_name()
.unwrap()
.to_string_lossy()
.parse()
.unwrap();
repo.attach_timeline(timeline_id)?;
}
Ok(repo)
}
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
// Mock WAL redo manager that doesn't do much
pub struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, ZenithWalRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} to get to {}, with {} and {} records",
key,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}
#[cfg(test)]
mod tests {
use super::metadata::METADATA_FILE_NAME;
use super::*;
use crate::keyspace::KeySpaceAccum;
use crate::repository::repo_harness::*;
use crate::layered_repository::repo_harness::*;
use crate::repository::{Key, Value};
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
use rand::{thread_rng, Rng};
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
#[test]
fn test_basic() -> Result<()> {
let repo = RepoHarness::create("test_basic")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
writer.finish_write(Lsn(0x20));
drop(writer);
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
Ok(())
}
#[test]
fn no_duplicate_timelines() -> Result<()> {
let repo = RepoHarness::create("no_duplicate_timelines")?.load();
let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
}
Ok(())
}
/// Convenience function to create a page image with given string as the only content
pub fn test_value(s: &str) -> Value {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
Value::Image(buf.freeze())
}
///
/// Test branch creation
///
#[test]
fn test_branch() -> Result<()> {
let repo = RepoHarness::create("test_branch")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
let writer = tline.writer();
use std::str::from_utf8;
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
#[allow(non_snake_case)]
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
// Insert a value on the timeline
writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
writer.finish_write(Lsn(0x20));
writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
writer.finish_write(Lsn(0x30));
writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
writer.finish_write(Lsn(0x40));
//assert_current_logical_size(&tline, Lsn(0x40));
// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
let new_writer = newtline.writer();
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
new_writer.finish_write(Lsn(0x40));
// Check page contents on both branches
assert_eq!(
from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
"foo at 0x40"
);
assert_eq!(
from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
"bar at 0x40"
);
assert_eq!(
from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
"foobar at 0x20"
);
//assert_current_logical_size(&tline, Lsn(0x40));
Ok(())
}
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> {
let mut lsn = start_lsn;
#[allow(non_snake_case)]
{
let writer = tline.writer();
// Create a relation on the timeline
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
}
tline.checkpoint(CheckpointConfig::Forced)?;
{
let writer = tline.writer();
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
}
tline.checkpoint(CheckpointConfig::Forced)
}
#[test]
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
let repo =
RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
Ok(_) => panic!("branching should have failed"),
Err(err) => {
assert!(err.to_string().contains("invalid branch start lsn"));
assert!(err
.source()
.unwrap()
.to_string()
.contains("we might've already garbage collected needed data"))
}
}
Ok(())
}
#[test]
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
Ok(_) => panic!("branching should have failed"),
Err(err) => {
assert!(&err.to_string().contains("invalid branch start lsn"));
assert!(&err
.source()
.unwrap()
.to_string()
.contains("is earlier than latest GC horizon"));
}
}
Ok(())
}
/*
// FIXME: This currently fails to error out. Calling GC doesn't currently
// remove the old value, we'd need to work a little harder
#[test]
fn test_prohibit_get_for_garbage_collected_data() -> Result<()> {
let repo =
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
match tline.get(*TEST_KEY, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err.to_string().contains("not found at")),
}
Ok(())
}
*/
#[test]
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
let repo =
RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
Ok(())
}
#[test]
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
// run gc on parent
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// Check that the data is still accessible on the branch.
assert_eq!(
newtline.get(*TEST_KEY, Lsn(0x50))?,
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
);
Ok(())
}
#[test]
fn timeline_load() -> Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = RepoHarness::create(TEST_NAME)?;
{
let repo = harness.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
let repo = harness.load();
let tline = repo
.get_timeline(TIMELINE_ID)
.expect("cannot load timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());
let tline = repo
.get_timeline(TIMELINE_ID)
.expect("cannot load timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
Ok(())
}
#[test]
fn timeline_load_with_ancestor() -> Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = RepoHarness::create(TEST_NAME)?;
// create two timelines
{
let repo = harness.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tline.checkpoint(CheckpointConfig::Forced)?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
// check that both of them are initially unloaded
let repo = harness.load();
{
let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
let tline = repo
.get_timeline(NEW_TIMELINE_ID)
.expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
}
// load only child timeline
let _ = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("cannot load timeline");
// check that both, child and ancestor are loaded
let tline = repo
.get_timeline(NEW_TIMELINE_ID)
.expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
Ok(())
}
#[test]
fn corrupt_metadata() -> Result<()> {
const TEST_NAME: &str = "corrupt_metadata";
@@ -940,22 +1468,13 @@ pub mod tests {
Ok(())
}
// Target file size in the unit tests. In production, the target
// file size is much larger, maybe 1 GB. But a small size makes it
// much faster to exercise all the logic for creating the files,
// garbage collection, compaction etc.
pub const TEST_FILE_SIZE: u64 = 4 * 1024 * 1024;
#[test]
fn test_images() -> Result<()> {
let repo = RepoHarness::create("test_images")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
#[allow(non_snake_case)]
let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
writer.finish_write(Lsn(0x10));
drop(writer);
@@ -963,7 +1482,7 @@ pub mod tests {
tline.compact()?;
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
writer.finish_write(Lsn(0x20));
drop(writer);
@@ -971,7 +1490,7 @@ pub mod tests {
tline.compact()?;
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
writer.finish_write(Lsn(0x30));
drop(writer);
@@ -979,18 +1498,18 @@ pub mod tests {
tline.compact()?;
let writer = tline.writer();
writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
writer.finish_write(Lsn(0x40));
drop(writer);
tline.checkpoint(CheckpointConfig::Forced)?;
tline.compact()?;
assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
assert_eq!(tline.get(TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
assert_eq!(tline.get(TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
Ok(())
}

View File

@@ -157,7 +157,14 @@ where
// Look up the right page
let cache = page_cache::get();
loop {
match cache.read_immutable_buf(self.file_id, blknum) {
match cache
.read_immutable_buf(self.file_id, blknum)
.map_err(|e| {
std::io::Error::new(
std::io::ErrorKind::Other,
format!("Failed to read immutable buf: {e:#}"),
)
})? {
ReadBufResult::Found(guard) => break Ok(guard),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer

View File

@@ -12,7 +12,7 @@ use once_cell::sync::Lazy;
use std::cmp::min;
use std::collections::HashMap;
use std::fs::OpenOptions;
use std::io::{Error, ErrorKind};
use std::io::{self, ErrorKind};
use std::ops::DerefMut;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
@@ -51,7 +51,7 @@ impl EphemeralFile {
conf: &PageServerConf,
tenantid: ZTenantId,
timelineid: ZTimelineId,
) -> Result<EphemeralFile, std::io::Error> {
) -> Result<EphemeralFile, io::Error> {
let mut l = EPHEMERAL_FILES.write().unwrap();
let file_id = l.next_file_id;
l.next_file_id += 1;
@@ -76,7 +76,7 @@ impl EphemeralFile {
})
}
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
let mut off = 0;
while off < PAGE_SZ {
let n = self
@@ -96,10 +96,13 @@ impl EphemeralFile {
Ok(())
}
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, Error> {
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
// Look up the right page
let cache = page_cache::get();
let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) {
let mut write_guard = match cache
.write_ephemeral_buf(self.file_id, blkno)
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
{
WriteBufResult::Found(guard) => guard,
WriteBufResult::NotFound(mut guard) => {
// Read the page from disk into the buffer
@@ -127,7 +130,7 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
}
impl FileExt for EphemeralFile {
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
// Look up the right page
let blkno = (offset / PAGE_SZ as u64) as u32;
let off = offset as usize % PAGE_SZ;
@@ -137,7 +140,10 @@ impl FileExt for EphemeralFile {
let mut write_guard;
let cache = page_cache::get();
let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
let buf = match cache
.read_ephemeral_buf(self.file_id, blkno)
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
{
ReadBufResult::Found(guard) => {
read_guard = guard;
read_guard.as_ref()
@@ -158,7 +164,7 @@ impl FileExt for EphemeralFile {
Ok(len)
}
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
// Look up the right page
let blkno = (offset / PAGE_SZ as u64) as u32;
let off = offset as usize % PAGE_SZ;
@@ -166,7 +172,10 @@ impl FileExt for EphemeralFile {
let mut write_guard;
let cache = page_cache::get();
let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
let buf = match cache
.write_ephemeral_buf(self.file_id, blkno)
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
{
WriteBufResult::Found(guard) => {
write_guard = guard;
write_guard.deref_mut()
@@ -190,7 +199,7 @@ impl FileExt for EphemeralFile {
}
impl BlobWriter for EphemeralFile {
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
let pos = self.size;
let mut blknum = (self.size / PAGE_SZ as u64) as u32;
@@ -268,11 +277,11 @@ impl Drop for EphemeralFile {
}
}
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
Ok(_) => Ok(()),
Err(e) => Err(std::io::Error::new(
Err(e) => Err(io::Error::new(
ErrorKind::Other,
format!(
"failed to write back to ephemeral file at {} error: {}",
@@ -282,7 +291,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
)),
}
} else {
Err(std::io::Error::new(
Err(io::Error::new(
ErrorKind::Other,
"could not write back page, not found in ephemeral files hash",
))
@@ -292,11 +301,14 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
impl BlockReader for EphemeralFile {
type BlockLease = page_cache::PageReadGuard<'static>;
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
// Look up the right page
let cache = page_cache::get();
loop {
match cache.read_ephemeral_buf(self.file_id, blknum) {
match cache
.read_ephemeral_buf(self.file_id, blknum)
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
{
ReadBufResult::Found(guard) => return Ok(guard),
ReadBufResult::NotFound(mut write_guard) => {
// Read the page from disk into the buffer
@@ -311,6 +323,10 @@ impl BlockReader for EphemeralFile {
}
}
fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
}
#[cfg(test)]
mod tests {
use super::*;
@@ -322,7 +338,7 @@ mod tests {
fn repo_harness(
test_name: &str,
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> {
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
let conf = PageServerConf::dummy_conf(repo_dir);
@@ -339,7 +355,7 @@ mod tests {
// Helper function to slurp contents of a file, starting at the current position,
// into a string
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
let mut buf = Vec::new();
buf.resize(len, 0u8);
@@ -351,7 +367,7 @@ mod tests {
}
#[test]
fn test_ephemeral_files() -> Result<(), Error> {
fn test_ephemeral_files() -> Result<(), io::Error> {
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
let file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
@@ -382,7 +398,7 @@ mod tests {
}
#[test]
fn test_ephemeral_blobs() -> Result<(), Error> {
fn test_ephemeral_blobs() -> Result<(), io::Error> {
let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?;
let mut file = EphemeralFile::create(conf, tenantid, timelineid)?;

View File

@@ -10,7 +10,7 @@ use std::path::PathBuf;
use utils::lsn::Lsn;
// Note: LayeredTimeline::load_layer_map() relies on this sort order
// Note: Timeline::load_layer_map() relies on this sort order
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct DeltaFileName {
pub key_range: Range<Key>,

View File

@@ -1,4 +1,4 @@
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
//! Every image of a certain timeline from [`crate::layered_repository::Repository`]
//! has a metadata that needs to be stored persistently.
//!
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
@@ -6,10 +6,13 @@
//!
//! The module contains all structs and related helper methods related to timeline metadata.
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::path::PathBuf;
use anyhow::ensure;
use anyhow::{bail, ensure, Context};
use serde::{Deserialize, Serialize};
use tracing::info_span;
use utils::{
bin_ser::BeSer,
lsn::Lsn,
@@ -17,6 +20,7 @@ use utils::{
};
use crate::config::PageServerConf;
use crate::virtual_file::VirtualFile;
use crate::STORAGE_FORMAT_VERSION;
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
@@ -30,7 +34,7 @@ pub const METADATA_FILE_NAME: &str = "metadata";
/// Metadata stored on disk for each timeline
///
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
/// The fields correspond to the values we hold in memory, in Timeline.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimelineMetadata {
hdr: TimelineMetadataHeader,
@@ -65,17 +69,6 @@ struct TimelineMetadataBody {
initdb_lsn: Lsn,
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(METADATA_FILE_NAME)
}
impl TimelineMetadata {
pub fn new(
disk_consistent_lsn: Lsn,
@@ -173,11 +166,57 @@ impl TimelineMetadata {
}
}
/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
pub fn metadata_path(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
) -> PathBuf {
conf.timeline_path(&timelineid, &tenantid)
.join(METADATA_FILE_NAME)
}
/// Save timeline metadata to file
pub fn save_metadata(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
data: &TimelineMetadata,
first_save: bool,
) -> anyhow::Result<()> {
let _enter = info_span!("saving metadata").entered();
let path = metadata_path(conf, timelineid, tenantid);
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = VirtualFile::open_with_options(
&path,
OpenOptions::new().write(true).create_new(first_save),
)?;
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
if file.write(&metadata_bytes)? != metadata_bytes.len() {
bail!("Could not write all the metadata bytes in a single call");
}
file.sync_all()?;
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
&path
.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;
}
Ok(())
}
#[cfg(test)]
mod tests {
use crate::repository::repo_harness::TIMELINE_ID;
use super::*;
use crate::layered_repository::repo_harness::TIMELINE_ID;
#[test]
fn metadata_serializes_correctly() {

View File

@@ -9,14 +9,12 @@ use once_cell::sync::Lazy;
use tracing::*;
use std::cmp::{max, min, Ordering};
use std::collections::{hash_map::Entry, HashMap, HashSet};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::fs::{File, OpenOptions};
use std::io::Write;
use std::ops::{Deref, Range};
use std::path::PathBuf;
use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError};
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
use std::time::{Duration, Instant, SystemTime};
use metrics::{
@@ -32,7 +30,7 @@ use crate::layered_repository::{
image_layer::{ImageLayer, ImageLayerWriter},
inmemory_layer::InMemoryLayer,
layer_map::{LayerMap, SearchResult},
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME},
par_fsync,
storage_layer::{Layer, ValueReconstructResult, ValueReconstructState},
};
@@ -43,19 +41,18 @@ use crate::pgdatadir_mapping::BlockNumber;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::reltag::RelTag;
use crate::tenant_config::TenantConfOpt;
use crate::DatadirTimeline;
use postgres_ffi::v14::xlog_utils::to_pg_timestamp;
use utils::{
lsn::{AtomicLsn, Lsn, RecordLsn},
seqwait::SeqWait,
simple_rcu::{Rcu, RcuReadGuard},
zid::{ZTenantId, ZTimelineId},
};
use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter};
use crate::repository::{GcResult, RepositoryTimeline};
use crate::repository::{Key, Value};
use crate::thread_mgr;
use crate::virtual_file::VirtualFile;
use crate::walreceiver::IS_WAL_RECEIVER;
use crate::walredo::WalRedoManager;
use crate::CheckpointConfig;
@@ -140,6 +137,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
register_int_gauge_vec!(
"pageserver_current_logical_size",
"Current logical size grouped by timeline",
&["tenant_id", "timeline_id"]
)
.expect("failed to define a metric")
});
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
// or in testing they estimate how much we would upload if we did.
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
@@ -160,7 +166,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
#[derive(Clone)]
pub enum LayeredTimelineEntry {
Loaded(Arc<LayeredTimeline>),
Loaded(Arc<Timeline>),
Unloaded {
id: ZTimelineId,
metadata: TimelineMetadata,
@@ -191,7 +197,7 @@ impl LayeredTimelineEntry {
}
}
fn ensure_loaded(&self) -> anyhow::Result<&Arc<LayeredTimeline>> {
fn ensure_loaded(&self) -> anyhow::Result<&Arc<Timeline>> {
match self {
LayeredTimelineEntry::Loaded(timeline) => Ok(timeline),
LayeredTimelineEntry::Unloaded { .. } => {
@@ -213,7 +219,7 @@ impl LayeredTimelineEntry {
}
}
impl From<LayeredTimelineEntry> for RepositoryTimeline<LayeredTimeline> {
impl From<LayeredTimelineEntry> for RepositoryTimeline<Timeline> {
fn from(entry: LayeredTimelineEntry) -> Self {
match entry {
LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _),
@@ -235,6 +241,8 @@ struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub wait_lsn_time_histo: Histogram,
pub current_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
pub current_logical_size_gauge: IntGauge,
}
impl TimelineMetrics {
@@ -272,6 +280,9 @@ impl TimelineMetrics {
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
.unwrap();
TimelineMetrics {
reconstruct_time_histo,
@@ -284,11 +295,12 @@ impl TimelineMetrics {
last_record_gauge,
wait_lsn_time_histo,
current_physical_size_gauge,
current_logical_size_gauge,
}
}
}
pub struct LayeredTimeline {
pub struct Timeline {
conf: &'static PageServerConf,
tenant_conf: Arc<RwLock<TenantConfOpt>>,
@@ -340,8 +352,8 @@ pub struct LayeredTimeline {
upload_layers: AtomicBool,
/// Ensures layers aren't frozen by checkpointer between
/// [`LayeredTimeline::get_layer_for_write`] and layer reads.
/// Locked automatically by [`LayeredTimelineWriter`] and checkpointer.
/// [`Timeline::get_layer_for_write`] and layer reads.
/// Locked automatically by [`TimelineWriter`] and checkpointer.
/// Must always be acquired before the layer map/individual layer lock
/// to avoid deadlock.
write_lock: Mutex<()>,
@@ -351,12 +363,12 @@ pub struct LayeredTimeline {
/// Layer removal lock.
/// A lock to ensure that no layer of the timeline is removed concurrently by other threads.
/// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`],
/// and [`LayeredRepository::delete_timeline`].
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
/// and [`Repository::delete_timeline`].
layer_removal_cs: Mutex<()>,
// Needed to ensure that we can't create a branch at a point that was already garbage collected
pub latest_gc_cutoff_lsn: RwLock<Lsn>,
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
// List of child timelines and their branch points. This is needed to avoid
// garbage collecting data that is still needed by the child timelines.
@@ -377,7 +389,32 @@ pub struct LayeredTimeline {
repartition_threshold: u64,
/// Current logical size of the "datadir", at the last LSN.
current_logical_size: AtomicIsize,
///
/// Size shouldn't ever be negative, but this is signed for two reasons:
///
/// 1. If we initialized the "baseline" size lazily, while we already
/// process incoming WAL, the incoming WAL records could decrement the
/// variable and temporarily make it negative. (This is just future-proofing;
/// the initialization is currently not done lazily.)
///
/// 2. If there is a bug and we e.g. forget to increment it in some cases
/// when size grows, but remember to decrement it when it shrinks again, the
/// variable could go negative. In that case, it seems better to at least
/// try to keep tracking it, rather than clamp or overflow it. Note that
/// get_current_logical_size() will clamp the returned value to zero if it's
/// negative, and log an error. Could set it permanently to zero or some
/// special value to indicate "broken" instead, but this will do for now.
///
/// Note that we also expose a copy of this value as a prometheus metric,
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
/// and `set_current_logical_size` functions to modify this, they will
/// also keep the prometheus metric in sync.
current_logical_size: AtomicI64,
// TODO we don't have a good, API to ensure on a compilation level
// that the timeline passes all initialization.
// Hence we ensure that we init at least once for every timeline
// and keep this flag to avoid potentually long recomputes.
logical_size_initialized: AtomicBool,
/// Information about the last processed message by the WAL receiver,
/// or None if WAL receiver has not received anything for this timeline
@@ -385,7 +422,7 @@ pub struct LayeredTimeline {
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
/// Relation size cache
rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
}
pub struct WalReceiverInfo {
@@ -394,46 +431,6 @@ pub struct WalReceiverInfo {
pub last_received_msg_ts: u128,
}
/// Inherit all the functions from DatadirTimeline, to provide the
/// functionality to store PostgreSQL relations, SLRUs, etc. in a
/// LayeredTimeline.
impl DatadirTimeline for LayeredTimeline {
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
return Some(*nblocks);
}
}
None
}
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
match rel_size_cache.entry(tag) {
Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
}
}
}
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.insert(tag, (lsn, nblocks));
}
fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.remove(tag);
}
}
///
/// Information about how much history needs to be retained, needed by
/// Garbage Collection.
@@ -464,45 +461,37 @@ pub struct GcInfo {
}
/// Public interface functions
impl Timeline for LayeredTimeline {
fn get_ancestor_lsn(&self) -> Lsn {
impl Timeline {
//------------------------------------------------------------------------------
// Public GET functions
//------------------------------------------------------------------------------
/// Get the LSN where this branch was created
pub fn get_ancestor_lsn(&self) -> Lsn {
self.ancestor_lsn
}
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
/// Get the ancestor's timeline id
pub fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
self.ancestor_timeline
.as_ref()
.map(LayeredTimelineEntry::timeline_id)
}
/// Wait until WAL has been received up to the given LSN.
fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
// This should never be called from the WAL receiver thread, because that could lead
// to a deadlock.
ensure!(
!IS_WAL_RECEIVER.with(|c| c.get()),
"wait_lsn called by WAL receiver thread"
);
self.metrics.wait_lsn_time_histo.observe_closure_duration(
|| self.last_record_lsn
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
.with_context(|| {
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
)
}))?;
Ok(())
/// Lock and get timeline's GC cuttof
pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read()
}
fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn> {
self.latest_gc_cutoff_lsn.read().unwrap()
}
/// Look up the value with the given a key
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
/// Look up given page version.
///
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
/// above this needs to store suitable metadata to track what data exists with
/// what keys, in separate metadata entries. If a non-existent key is requested,
/// the Repository implementation may incorrectly return a value from an ancestor
/// branch, for example, or waste a lot of cycles chasing the non-existing key.
///
pub fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
@@ -531,68 +520,31 @@ impl Timeline for LayeredTimeline {
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
}
/// Public entry point for checkpoint(). All the logic is in the private
/// checkpoint_internal function, this public facade just wraps it for
/// metrics collection.
fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
match cconf {
CheckpointConfig::Flush => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)
}
CheckpointConfig::Forced => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)?;
self.compact()
}
}
}
///
/// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn.
///
fn check_lsn_is_in_scope(
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
lsn,
**latest_gc_cutoff_lsn,
);
Ok(())
}
fn get_last_record_lsn(&self) -> Lsn {
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
pub fn get_last_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().last
}
fn get_prev_record_lsn(&self) -> Lsn {
pub fn get_prev_record_lsn(&self) -> Lsn {
self.last_record_lsn.load().prev
}
fn get_last_record_rlsn(&self) -> RecordLsn {
/// Atomically get both last and prev.
pub fn get_last_record_rlsn(&self) -> RecordLsn {
self.last_record_lsn.load()
}
fn get_disk_consistent_lsn(&self) -> Lsn {
pub fn get_disk_consistent_lsn(&self) -> Lsn {
self.disk_consistent_lsn.load()
}
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a> {
Box::new(LayeredTimelineWriter {
tl: self,
_write_guard: self.write_lock.lock().unwrap(),
})
}
fn get_physical_size(&self) -> u64 {
/// Get the physical size of the timeline at the latest LSN
pub fn get_physical_size(&self) -> u64 {
self.metrics.current_physical_size_gauge.get()
}
fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
/// Get the physical size of the timeline at the latest LSN non incrementally
pub fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
// total size of layer files in the current timeline directory
let mut total_physical_size = 0;
@@ -611,9 +563,88 @@ impl Timeline for LayeredTimeline {
Ok(total_physical_size)
}
///
/// Wait until WAL has been received and processed up to this LSN.
///
/// You should call this before any of the other get_* or list_* functions. Calling
/// those functions with an LSN that has been processed yet is an error.
///
pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
// This should never be called from the WAL receiver thread, because that could lead
// to a deadlock.
ensure!(
!IS_WAL_RECEIVER.with(|c| c.get()),
"wait_lsn called by WAL receiver thread"
);
self.metrics.wait_lsn_time_histo.observe_closure_duration(
|| self.last_record_lsn
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
.with_context(|| {
format!(
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
)
}))?;
Ok(())
}
/// Check that it is valid to request operations with that lsn.
pub fn check_lsn_is_in_scope(
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
lsn,
**latest_gc_cutoff_lsn,
);
Ok(())
}
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
//
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
match cconf {
CheckpointConfig::Flush => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)
}
CheckpointConfig::Forced => {
self.freeze_inmem_layer(false);
self.flush_frozen_layers(true)?;
self.compact()
}
}
}
/// Mutate the timeline with a [`TimelineWriter`].
///
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
/// is a generic type in this trait. But that doesn't currently work in
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
pub fn writer(&self) -> TimelineWriter<'_> {
TimelineWriter {
tl: self,
_write_guard: self.write_lock.lock().unwrap(),
}
}
}
impl LayeredTimeline {
// Private functions
impl Timeline {
fn get_checkpoint_distance(&self) -> u64 {
let tenant_conf = self.tenant_conf.read().unwrap();
tenant_conf
@@ -662,8 +693,8 @@ impl LayeredTimeline {
tenant_id: ZTenantId,
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
upload_layers: bool,
) -> LayeredTimeline {
let mut result = LayeredTimeline {
) -> Timeline {
let mut result = Timeline {
conf,
tenant_conf,
timeline_id,
@@ -699,10 +730,11 @@ impl LayeredTimeline {
pitr_cutoff: Lsn(0),
}),
latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()),
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
initdb_lsn: metadata.initdb_lsn(),
current_logical_size: AtomicIsize::new(0),
current_logical_size: AtomicI64::new(0),
logical_size_initialized: AtomicBool::new(false),
partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
repartition_threshold: 0,
@@ -807,6 +839,10 @@ impl LayeredTimeline {
///
/// This can be a slow operation.
pub fn init_logical_size(&self) -> Result<()> {
if self.logical_size_initialized.load(AtomicOrdering::Acquire) {
return Ok(());
}
// Try a fast-path first:
// Copy logical size from ancestor timeline if there has been no changes on this
// branch, and no changes on the ancestor branch since the branch point.
@@ -819,8 +855,7 @@ impl LayeredTimeline {
//
// Logical size 0 means that it was not initialized, so don't believe that.
if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
self.current_logical_size
.store(ancestor_logical_size as isize, AtomicOrdering::SeqCst);
self.set_current_logical_size(ancestor_logical_size);
debug!(
"logical size copied from ancestor: {}",
ancestor_logical_size
@@ -834,8 +869,7 @@ impl LayeredTimeline {
// Have to calculate it the hard way
let last_lsn = self.get_last_record_lsn();
let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
self.current_logical_size
.store(logical_size as isize, AtomicOrdering::SeqCst);
self.set_current_logical_size(logical_size);
debug!("calculated logical size the hard way: {}", logical_size);
timer.stop_and_record();
@@ -844,10 +878,10 @@ impl LayeredTimeline {
/// Retrieve current logical size of the timeline
///
/// NOTE: counted incrementally, includes ancestors,
pub fn get_current_logical_size(&self) -> usize {
/// NOTE: counted incrementally, includes ancestors.
pub fn get_current_logical_size(&self) -> u64 {
let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire);
match usize::try_from(current_logical_size) {
match u64::try_from(current_logical_size) {
Ok(sz) => sz,
Err(_) => {
error!(
@@ -859,6 +893,36 @@ impl LayeredTimeline {
}
}
/// Update current logical size, adding `delta' to the old value.
fn update_current_logical_size(&self, delta: i64) {
let new_size = self
.current_logical_size
.fetch_add(delta, AtomicOrdering::SeqCst);
// Also set the value in the prometheus gauge. Note that
// there is a race condition here: if this is is called by two
// threads concurrently, the prometheus gauge might be set to
// one value while current_logical_size is set to the
// other. Currently, only initialization and the WAL receiver
// updates the logical size, and they don't run concurrently,
// so it cannot happen. And even if it did, it wouldn't be
// very serious, the metrics would just be slightly off until
// the next update.
self.metrics.current_logical_size_gauge.set(new_size);
}
/// Set current logical size.
fn set_current_logical_size(&self, new_size: u64) {
self.current_logical_size
.store(new_size as i64, AtomicOrdering::SeqCst);
self.logical_size_initialized
.store(true, AtomicOrdering::SeqCst);
// Also set the value in the prometheus gauge. Same race condition
// here as in `update_current_logical_size`.
self.metrics.current_logical_size_gauge.set(new_size as i64);
}
///
/// Get a handle to a Layer for reading.
///
@@ -1014,7 +1078,7 @@ impl LayeredTimeline {
Some((lsn, img))
}
fn get_ancestor_timeline(&self) -> Result<Arc<LayeredTimeline>> {
fn get_ancestor_timeline(&self) -> Result<Arc<Timeline>> {
let ancestor = self
.ancestor_timeline
.as_ref()
@@ -1135,7 +1199,7 @@ impl LayeredTimeline {
/// Also flush after a period of time without new data -- it helps
/// safekeepers to regard pageserver as caught up and suspend activity.
///
pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> Result<()> {
let last_lsn = self.get_last_record_lsn();
let layers = self.layers.read().unwrap();
if let Some(open_layer) = &layers.open_layer {
@@ -1314,7 +1378,7 @@ impl LayeredTimeline {
ondisk_prev_record_lsn,
ancestor_timelineid,
self.ancestor_lsn,
*self.latest_gc_cutoff_lsn.read().unwrap(),
*self.latest_gc_cutoff_lsn.read(),
self.initdb_lsn,
);
@@ -1969,9 +2033,21 @@ impl LayeredTimeline {
let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered();
// We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
// See branch_timeline() for details.
*self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff;
// We need to ensure that no one tries to read page versions or create
// branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
// for details. This will block until the old value is no longer in use.
//
// The GC cutoff should only ever move forwards.
{
let write_guard = self.latest_gc_cutoff_lsn.write();
ensure!(
*write_guard <= new_gc_cutoff,
"Cannot move GC cutoff LSN backwards (was {}, new {})",
*write_guard,
new_gc_cutoff
);
write_guard.store(new_gc_cutoff);
}
info!("GC starting");
@@ -2117,7 +2193,7 @@ impl LayeredTimeline {
key: Key,
request_lsn: Lsn,
mut data: ValueReconstructState,
) -> Result<Bytes> {
) -> anyhow::Result<Bytes> {
// Perform WAL redo if needed
data.records.reverse();
@@ -2167,13 +2243,15 @@ impl LayeredTimeline {
if img.len() == page_cache::PAGE_SZ {
let cache = page_cache::get();
cache.memorize_materialized_page(
self.tenant_id,
self.timeline_id,
key,
last_rec_lsn,
&img,
);
cache
.memorize_materialized_page(
self.tenant_id,
self.timeline_id,
key,
last_rec_lsn,
&img,
)
.context("Materialized page memoization failed")?;
}
Ok(img)
@@ -2208,39 +2286,50 @@ fn layer_traversal_error(
Err(msg_iter.fold(err, |err, msg| err.context(msg)))
}
struct LayeredTimelineWriter<'a> {
tl: &'a LayeredTimeline,
/// Various functions to mutate the timeline.
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
pub struct TimelineWriter<'a> {
tl: &'a Timeline,
_write_guard: MutexGuard<'a, ()>,
}
impl Deref for LayeredTimelineWriter<'_> {
type Target = dyn Timeline;
impl Deref for TimelineWriter<'_> {
type Target = Timeline;
fn deref(&self) -> &Self::Target {
self.tl
}
}
impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
impl<'a> TimelineWriter<'a> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
self.tl.put_value(key, lsn, value)
}
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
self.tl.put_tombstone(key_range, lsn)
}
///
/// Track the end of the latest digested WAL record.
/// Remember the (end of) last valid WAL record remembered in the timeline.
///
fn finish_write(&self, new_lsn: Lsn) {
/// Call this after you have finished writing all the WAL up to 'lsn'.
///
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
/// the latest and can be read.
pub fn finish_write(&self, new_lsn: Lsn) {
self.tl.finish_write(new_lsn);
}
fn update_current_logical_size(&self, delta: isize) {
self.tl
.current_logical_size
.fetch_add(delta, AtomicOrdering::SeqCst);
pub fn update_current_logical_size(&self, delta: i64) {
self.tl.update_current_logical_size(delta)
}
}
@@ -2263,39 +2352,3 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
bail!("couldn't find an unused backup number for {:?}", path)
}
/// Save timeline metadata to file
pub fn save_metadata(
conf: &'static PageServerConf,
timelineid: ZTimelineId,
tenantid: ZTenantId,
data: &TimelineMetadata,
first_save: bool,
) -> Result<()> {
let _enter = info_span!("saving metadata").entered();
let path = metadata_path(conf, timelineid, tenantid);
// use OpenOptions to ensure file presence is consistent with first_save
let mut file = VirtualFile::open_with_options(
&path,
OpenOptions::new().write(true).create_new(first_save),
)?;
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
if file.write(&metadata_bytes)? != metadata_bytes.len() {
bail!("Could not write all the metadata bytes in a single call");
}
file.sync_all()?;
// fsync the parent directory to ensure the directory entry is durable
if first_save {
let timeline_dir = File::open(
&path
.parent()
.expect("Metadata should always have a parent dir"),
)?;
timeline_dir.sync_all()?;
}
Ok(())
}

View File

@@ -28,8 +28,6 @@ use tracing::info;
use crate::thread_mgr::ThreadKind;
use metrics::{register_int_gauge_vec, IntGaugeVec};
use pgdatadir_mapping::DatadirTimeline;
/// Current storage format version
///
/// This is embedded in the metadata file, and also in the header of all the

View File

@@ -45,6 +45,7 @@ use std::{
},
};
use anyhow::Context;
use once_cell::sync::OnceCell;
use tracing::error;
use utils::{
@@ -342,7 +343,7 @@ impl PageCache {
key: Key,
lsn: Lsn,
img: &[u8],
) {
) -> anyhow::Result<()> {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_id,
@@ -352,7 +353,7 @@ impl PageCache {
lsn,
};
match self.lock_for_write(&cache_key) {
match self.lock_for_write(&cache_key)? {
WriteBufResult::Found(write_guard) => {
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
@@ -364,17 +365,19 @@ impl PageCache {
write_guard.mark_valid();
}
}
Ok(())
}
// Section 1.2: Public interface functions for working with Ephemeral pages.
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
self.lock_for_read(&mut cache_key)
}
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
self.lock_for_write(&cache_key)
@@ -402,7 +405,7 @@ impl PageCache {
// Section 1.3: Public interface functions for working with immutable file pages.
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key)
@@ -495,15 +498,16 @@ impl PageCache {
/// }
/// ```
///
fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
loop {
// First check if the key already exists in the cache.
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
return ReadBufResult::Found(read_guard);
return Ok(ReadBufResult::Found(read_guard));
}
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self.find_victim();
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -526,10 +530,10 @@ impl PageCache {
inner.dirty = false;
slot.usage_count.store(1, Ordering::Relaxed);
return ReadBufResult::NotFound(PageWriteGuard {
return Ok(ReadBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
});
}));
}
}
@@ -556,15 +560,16 @@ impl PageCache {
///
/// Similar to lock_for_read(), but the returned buffer is write-locked and
/// may be modified by the caller even if it's already found in the cache.
fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
loop {
// First check if the key already exists in the cache.
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
return WriteBufResult::Found(write_guard);
return Ok(WriteBufResult::Found(write_guard));
}
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self.find_victim();
let (slot_idx, mut inner) =
self.find_victim().context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
@@ -587,10 +592,10 @@ impl PageCache {
inner.dirty = false;
slot.usage_count.store(1, Ordering::Relaxed);
return WriteBufResult::NotFound(PageWriteGuard {
return Ok(WriteBufResult::NotFound(PageWriteGuard {
inner,
valid: false,
});
}));
}
}
@@ -754,7 +759,7 @@ impl PageCache {
/// Find a slot to evict.
///
/// On return, the slot is empty and write-locked.
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
let iter_limit = self.slots.len() * 10;
let mut iters = 0;
loop {
@@ -767,7 +772,7 @@ impl PageCache {
let mut inner = match slot.inner.try_write() {
Ok(inner) => inner,
Err(TryLockError::Poisoned(err)) => {
panic!("buffer lock was poisoned: {:?}", err)
anyhow::bail!("buffer lock was poisoned: {err:?}")
}
Err(TryLockError::WouldBlock) => {
// If we have looped through the whole buffer pool 10 times
@@ -777,7 +782,7 @@ impl PageCache {
// there are buffers in the pool. In practice, with a reasonably
// large buffer pool it really shouldn't happen.
if iters > iter_limit {
panic!("could not find a victim buffer to evict");
anyhow::bail!("exceeded evict iter limit");
}
continue;
}
@@ -804,7 +809,7 @@ impl PageCache {
inner.dirty = false;
inner.key = None;
}
return (slot_idx, inner);
return Ok((slot_idx, inner));
}
}
}

View File

@@ -17,24 +17,24 @@ use std::io::{self, Read};
use std::net::TcpListener;
use std::str;
use std::str::FromStr;
use std::sync::{Arc, RwLockReadGuard};
use std::sync::Arc;
use tracing::*;
use utils::{
auth::{self, Claims, JwtAuth, Scope},
lsn::Lsn,
postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend},
pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC},
simple_rcu::RcuReadGuard,
zid::{ZTenantId, ZTimelineId},
};
use crate::basebackup;
use crate::config::{PageServerConf, ProfilingConfig};
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
use crate::layered_repository::Timeline;
use crate::pgdatadir_mapping::LsnForTimestamp;
use crate::profiling::profpoint_start;
use crate::reltag::RelTag;
use crate::repository::Repository;
use crate::repository::Timeline;
use crate::tenant_mgr;
use crate::thread_mgr;
use crate::thread_mgr::ThreadKind;
@@ -495,22 +495,22 @@ impl PageServerHandler {
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_rel_exists_request(timeline.as_ref(), &req)
self.handle_get_rel_exists_request(&timeline, &req)
}),
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
.with_label_values(&["get_rel_size", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_nblocks_request(timeline.as_ref(), &req)
self.handle_get_nblocks_request(&timeline, &req)
}),
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
self.handle_get_page_at_lsn_request(&timeline, &req)
}),
PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME
.with_label_values(&["get_db_size", &tenant_id, &timeline_id])
.observe_closure_duration(|| {
self.handle_db_size_request(timeline.as_ref(), &req)
self.handle_db_size_request(&timeline, &req)
}),
};
@@ -636,11 +636,11 @@ impl PageServerHandler {
/// In either case, if the page server hasn't received the WAL up to the
/// requested LSN yet, we will wait for it to arrive. The return value is
/// the LSN that should be used to look up the page versions.
fn wait_or_get_last_lsn<T: DatadirTimeline>(
timeline: &T,
fn wait_or_get_last_lsn(
timeline: &Timeline,
mut lsn: Lsn,
latest: bool,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
) -> Result<Lsn> {
if latest {
// Latest page version was requested. If LSN is given, it is a hint
@@ -684,9 +684,9 @@ impl PageServerHandler {
Ok(lsn)
}
fn handle_get_rel_exists_request<T: DatadirTimeline>(
fn handle_get_rel_exists_request(
&self,
timeline: &T,
timeline: &Timeline,
req: &PagestreamExistsRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
@@ -701,9 +701,9 @@ impl PageServerHandler {
}))
}
fn handle_get_nblocks_request<T: DatadirTimeline>(
fn handle_get_nblocks_request(
&self,
timeline: &T,
timeline: &Timeline,
req: &PagestreamNblocksRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
@@ -717,9 +717,9 @@ impl PageServerHandler {
}))
}
fn handle_db_size_request<T: DatadirTimeline>(
fn handle_db_size_request(
&self,
timeline: &T,
timeline: &Timeline,
req: &PagestreamDbSizeRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
@@ -735,9 +735,9 @@ impl PageServerHandler {
}))
}
fn handle_get_page_at_lsn_request<T: DatadirTimeline>(
fn handle_get_page_at_lsn_request(
&self,
timeline: &T,
timeline: &Timeline,
req: &PagestreamGetPageRequest,
) -> Result<PagestreamBeMessage> {
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
@@ -745,7 +745,7 @@ impl PageServerHandler {
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
/*
// Add a 1s delay to some requests. The delayed causes the requests to
// Add a 1s delay to some requests. The delay helps the requests to
// hit the race condition from github issue #1047 more easily.
use rand::Rng;
if rand::thread_rng().gen::<u8>() < 5 {
@@ -1077,7 +1077,7 @@ impl postgres_backend::Handler for PageServerHandler {
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("do_gc ") {
// Run GC immediately on given timeline.
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
// This probably should require special authentication or a global flag to
// enable, I don't think we want to or need to allow regular clients to invoke
// GC.

View File

@@ -7,8 +7,8 @@
//! Clarify that)
//!
use crate::keyspace::{KeySpace, KeySpaceAccum};
use crate::layered_repository::Timeline;
use crate::reltag::{RelTag, SlruKind};
use crate::repository::Timeline;
use crate::repository::*;
use crate::walrecord::ZenithWalRecord;
use anyhow::{bail, ensure, Result};
@@ -18,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::TimestampTz;
use postgres_ffi::BLCKSZ;
use postgres_ffi::{Oid, TransactionId};
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::collections::{hash_map, HashMap, HashSet};
use std::ops::Range;
use tracing::{debug, trace, warn};
use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -35,23 +35,13 @@ pub enum LsnForTimestamp {
}
///
/// This trait provides all the functionality to store PostgreSQL relations, SLRUs,
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
/// and other special kinds of files, in a versioned key-value store. The
/// Timeline trait provides the key-value store.
/// Timeline struct provides the key-value store.
///
/// This is a trait, so that we can easily include all these functions in a Timeline
/// implementation. You're not expected to have different implementations of this trait,
/// rather, this provides an interface and implementation, over Timeline.
///
/// If you wanted to store other kinds of data in the Neon repository, e.g.
/// flat files or MySQL, you would create a new trait like this, with all the
/// functions that make sense for the kind of data you're storing. For flat files,
/// for example, you might have a function like "fn read(path, offset, size)".
/// We might also have that situation in the future, to support multiple PostgreSQL
/// versions, if there are big changes in how the data is organized in the data
/// directory, or if new special files are introduced.
///
pub trait DatadirTimeline: Timeline {
/// This is a separate impl, so that we can easily include all these functions in a Timeline
/// implementation, and might be moved into a separate struct later.
impl Timeline {
/// Start ingesting a WAL record, or other atomic modification of
/// the timeline.
///
@@ -75,7 +65,7 @@ pub trait DatadirTimeline: Timeline {
/// functions of the timeline until you finish! And if you update the
/// same page twice, the last update wins.
///
fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification
where
Self: Sized,
{
@@ -93,7 +83,7 @@ pub trait DatadirTimeline: Timeline {
//------------------------------------------------------------------------------
/// Look up given page version.
fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
ensure!(tag.relnode != 0, "invalid relnode");
let nblocks = self.get_rel_size(tag, lsn)?;
@@ -110,7 +100,7 @@ pub trait DatadirTimeline: Timeline {
}
// Get size of a database in blocks
fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
let mut total_blocks = 0;
let rels = self.list_rels(spcnode, dbnode, lsn)?;
@@ -123,7 +113,7 @@ pub trait DatadirTimeline: Timeline {
}
/// Get size of a relation file
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
ensure!(tag.relnode != 0, "invalid relnode");
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
@@ -151,7 +141,7 @@ pub trait DatadirTimeline: Timeline {
}
/// Does relation exist?
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
ensure!(tag.relnode != 0, "invalid relnode");
// first try to lookup relation in cache
@@ -169,7 +159,7 @@ pub trait DatadirTimeline: Timeline {
}
/// Get a list of all existing relations in given tablespace and database.
fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
// fetch directory listing
let key = rel_dir_to_key(spcnode, dbnode);
let buf = self.get(key, lsn)?;
@@ -187,7 +177,7 @@ pub trait DatadirTimeline: Timeline {
}
/// Look up given SLRU page version.
fn get_slru_page_at_lsn(
pub fn get_slru_page_at_lsn(
&self,
kind: SlruKind,
segno: u32,
@@ -199,14 +189,19 @@ pub trait DatadirTimeline: Timeline {
}
/// Get size of an SLRU segment
fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<BlockNumber> {
pub fn get_slru_segment_size(
&self,
kind: SlruKind,
segno: u32,
lsn: Lsn,
) -> Result<BlockNumber> {
let key = slru_segment_size_to_key(kind, segno);
let mut buf = self.get(key, lsn)?;
Ok(buf.get_u32_le())
}
/// Get size of an SLRU segment
fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
// fetch directory listing
let key = slru_dir_to_key(kind);
let buf = self.get(key, lsn)?;
@@ -223,7 +218,7 @@ pub trait DatadirTimeline: Timeline {
/// so it's not well defined which LSN you get if there were multiple commits
/// "in flight" at that point in time.
///
fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
let min_lsn = *gc_cutoff_lsn_guard;
let max_lsn = self.get_last_record_lsn();
@@ -286,7 +281,7 @@ pub trait DatadirTimeline: Timeline {
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
/// with a smaller/larger timestamp.
///
fn is_latest_commit_timestamp_ge_than(
pub fn is_latest_commit_timestamp_ge_than(
&self,
search_timestamp: TimestampTz,
probe_lsn: Lsn,
@@ -317,7 +312,7 @@ pub trait DatadirTimeline: Timeline {
}
/// Get a list of SLRU segments
fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
// fetch directory entry
let key = slru_dir_to_key(kind);
@@ -327,14 +322,14 @@ pub trait DatadirTimeline: Timeline {
Ok(dir.segments)
}
fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
let key = relmap_file_key(spcnode, dbnode);
let buf = self.get(key, lsn)?;
Ok(buf)
}
fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
// fetch directory entry
let buf = self.get(DBDIR_KEY, lsn)?;
let dir = DbDirectory::des(&buf)?;
@@ -342,13 +337,13 @@ pub trait DatadirTimeline: Timeline {
Ok(dir.dbdirs)
}
fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
let key = twophase_file_key(xid);
let buf = self.get(key, lsn)?;
Ok(buf)
}
fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
// fetch directory entry
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
let dir = TwoPhaseDirectory::des(&buf)?;
@@ -356,11 +351,11 @@ pub trait DatadirTimeline: Timeline {
Ok(dir.xids)
}
fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
self.get(CONTROLFILE_KEY, lsn)
}
fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
self.get(CHECKPOINT_KEY, lsn)
}
@@ -369,29 +364,29 @@ pub trait DatadirTimeline: Timeline {
///
/// Only relation blocks are counted currently. That excludes metadata,
/// SLRUs, twophase files etc.
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
// Fetch list of database dirs and iterate them
let buf = self.get(DBDIR_KEY, lsn)?;
let dbdir = DbDirectory::des(&buf)?;
let mut total_size: usize = 0;
let mut total_size: u64 = 0;
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
let relsize_key = rel_size_to_key(rel);
let mut buf = self.get(relsize_key, lsn)?;
let relsize = buf.get_u32_le();
total_size += relsize as usize;
total_size += relsize as u64;
}
}
Ok(total_size * BLCKSZ as usize)
Ok(total_size * BLCKSZ as u64)
}
///
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
/// Anything that's not listed maybe removed from the underlying storage (from
/// that LSN forwards).
fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
// Iterate through key ranges, greedily packing them into partitions
let mut result = KeySpaceAccum::new();
@@ -465,27 +460,54 @@ pub trait DatadirTimeline: Timeline {
}
/// Get cached size of relation if it not updated after specified LSN
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
let rel_size_cache = self.rel_size_cache.read().unwrap();
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
if lsn >= *cached_lsn {
return Some(*nblocks);
}
}
None
}
/// Update cached relation size if there is no more recent update
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
match rel_size_cache.entry(tag) {
hash_map::Entry::Occupied(mut entry) => {
let cached_lsn = entry.get_mut();
if lsn >= cached_lsn.0 {
*cached_lsn = (lsn, nblocks);
}
}
hash_map::Entry::Vacant(entry) => {
entry.insert((lsn, nblocks));
}
}
}
/// Store cached relation size
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.insert(tag, (lsn, nblocks));
}
/// Remove cached relation size
fn remove_cached_rel_size(&self, tag: &RelTag);
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
rel_size_cache.remove(tag);
}
}
/// DatadirModification represents an operation to ingest an atomic set of
/// updates to the repository. It is created by the 'begin_record'
/// function. It is called for each WAL record, so that all the modifications
/// by a one WAL record appear atomic.
pub struct DatadirModification<'a, T: DatadirTimeline> {
pub struct DatadirModification<'a> {
/// The timeline this modification applies to. You can access this to
/// read the state, but note that any pending updates are *not* reflected
/// in the state in 'tline' yet.
pub tline: &'a T,
pub tline: &'a Timeline,
/// Lsn assigned by begin_modification
pub lsn: Lsn,
@@ -495,10 +517,10 @@ pub struct DatadirModification<'a, T: DatadirTimeline> {
// underlying key-value store by the 'finish' function.
pending_updates: HashMap<Key, Value>,
pending_deletions: Vec<Range<Key>>,
pending_nblocks: isize,
pending_nblocks: i64,
}
impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
impl<'a> DatadirModification<'a> {
/// Initialize a completely new repository.
///
/// This inserts the directory metadata entries that are assumed to
@@ -654,7 +676,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
}
// Update logical database size.
self.pending_nblocks -= total_blocks as isize;
self.pending_nblocks -= total_blocks as i64;
// Delete all relations and metadata files for the spcnode/dnode
self.delete(dbdir_key_range(spcnode, dbnode));
@@ -697,7 +719,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
let buf = nblocks.to_le_bytes();
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
self.pending_nblocks += nblocks as isize;
self.pending_nblocks += nblocks as i64;
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
@@ -727,7 +749,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
// Update logical database size.
self.pending_nblocks -= old_size as isize - nblocks as isize;
self.pending_nblocks -= old_size as i64 - nblocks as i64;
}
Ok(())
}
@@ -749,7 +771,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
// Update relation size cache
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
self.pending_nblocks += nblocks as isize - old_size as isize;
self.pending_nblocks += nblocks as i64 - old_size as i64;
}
Ok(())
}
@@ -772,7 +794,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
// update logical size
let size_key = rel_size_to_key(rel);
let old_size = self.get(size_key)?.get_u32_le();
self.pending_nblocks -= old_size as isize;
self.pending_nblocks -= old_size as i64;
// Remove enty from relation size cache
self.tline.remove_cached_rel_size(&rel);
@@ -914,7 +936,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
result?;
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize);
writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64);
self.pending_nblocks = 0;
}
@@ -942,7 +964,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
writer.finish_write(lsn);
if pending_nblocks != 0 {
writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize);
writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64);
}
Ok(())
@@ -1368,10 +1390,10 @@ fn is_slru_block_key(key: Key) -> bool {
//
#[cfg(test)]
pub fn create_test_timeline<R: Repository>(
repo: R,
pub fn create_test_timeline(
repo: &crate::layered_repository::Repository,
timeline_id: utils::zid::ZTimelineId,
) -> Result<std::sync::Arc<R::Timeline>> {
) -> Result<std::sync::Arc<Timeline>> {
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
let mut m = tline.begin_modification(Lsn(8));
m.init_empty()?;

View File

@@ -1,19 +1,13 @@
use crate::layered_repository::metadata::TimelineMetadata;
use crate::storage_sync::index::RemoteIndex;
use crate::walrecord::ZenithWalRecord;
use crate::CheckpointConfig;
use anyhow::{bail, Result};
use byteorder::{ByteOrder, BE};
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::ops::{AddAssign, Range};
use std::sync::{Arc, RwLockReadGuard};
use std::sync::Arc;
use std::time::Duration;
use utils::{
lsn::{Lsn, RecordLsn},
zid::ZTimelineId,
};
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
/// Key used in the Repository kv-store.
@@ -181,78 +175,6 @@ impl Value {
}
}
///
/// A repository corresponds to one .neon directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
pub trait Repository: Send + Sync {
type Timeline: crate::DatadirTimeline;
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
/// See [`crate::remote_storage`] for more details about the synchronization.
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
/// Get Timeline handle for given zenith timeline ID.
/// This function is idempotent. It doesn't change internal state in any way.
fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Self::Timeline>>;
/// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<Self::Timeline>>;
/// Lists timelines the repository contains.
/// Up to repository's implementation to omit certain timelines that ar not considered ready for use.
fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Self::Timeline>)>;
/// Create a new, empty timeline. The caller is responsible for loading data into it
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
fn create_empty_timeline(
&self,
timeline_id: ZTimelineId,
initdb_lsn: Lsn,
) -> Result<Arc<Self::Timeline>>;
/// Branch a timeline
fn branch_timeline(
&self,
src: ZTimelineId,
dst: ZTimelineId,
start_lsn: Option<Lsn>,
) -> Result<()>;
/// Flush all data to disk.
///
/// this is used at graceful shutdown.
fn checkpoint(&self) -> Result<()>;
/// perform one garbage collection iteration, removing old data files from disk.
/// this function is periodically called by gc thread.
/// also it can be explicitly requested through page server api 'do_gc' command.
///
/// 'timelineid' specifies the timeline to GC, or None for all.
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
/// to make tests more deterministic.
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
fn gc_iteration(
&self,
timelineid: Option<ZTimelineId>,
horizon: u64,
pitr: Duration,
checkpoint_before_gc: bool,
) -> Result<GcResult>;
/// Perform one compaction iteration.
/// This function is periodically called by compactor thread.
/// Also it can be explicitly requested per timeline through page server
/// api's 'compact' command.
fn compaction_iteration(&self) -> Result<()>;
/// removes timeline-related in-memory data
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;
/// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
fn get_remote_index(&self) -> &RemoteIndex;
}
/// A timeline, that belongs to the current repository.
pub enum RepositoryTimeline<T> {
/// Timeline, with its files present locally in pageserver's working directory.
@@ -304,621 +226,3 @@ impl AddAssign for GcResult {
self.elapsed += other.elapsed;
}
}
pub trait Timeline: Send + Sync {
//------------------------------------------------------------------------------
// Public GET functions
//------------------------------------------------------------------------------
///
/// Wait until WAL has been received and processed up to this LSN.
///
/// You should call this before any of the other get_* or list_* functions. Calling
/// those functions with an LSN that has been processed yet is an error.
///
fn wait_lsn(&self, lsn: Lsn) -> Result<()>;
/// Lock and get timeline's GC cuttof
fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn>;
/// Look up given page version.
///
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
/// above this needs to store suitable metadata to track what data exists with
/// what keys, in separate metadata entries. If a non-existent key is requested,
/// the Repository implementation may incorrectly return a value from an ancestor
/// branch, for example, or waste a lot of cycles chasing the non-existing key.
///
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes>;
/// Get the ancestor's timeline id
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
/// Get the LSN where this branch was created
fn get_ancestor_lsn(&self) -> Lsn;
//------------------------------------------------------------------------------
// Public PUT functions, to update the repository with new page versions.
//
// These are called by the WAL receiver to digest WAL records.
//------------------------------------------------------------------------------
/// Atomically get both last and prev.
fn get_last_record_rlsn(&self) -> RecordLsn;
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
fn get_last_record_lsn(&self) -> Lsn;
fn get_prev_record_lsn(&self) -> Lsn;
fn get_disk_consistent_lsn(&self) -> Lsn;
/// Mutate the timeline with a [`TimelineWriter`].
///
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
/// is a generic type in this trait. But that doesn't currently work in
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
///
/// Flush to disk all data that was written with the put_* functions
///
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
/// know anything about them here in the repository.
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
///
/// Check that it is valid to request operations with that lsn.
fn check_lsn_is_in_scope(
&self,
lsn: Lsn,
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
) -> Result<()>;
/// Get the physical size of the timeline at the latest LSN
fn get_physical_size(&self) -> u64;
/// Get the physical size of the timeline at the latest LSN non incrementally
fn get_physical_size_non_incremental(&self) -> Result<u64>;
}
/// Various functions to mutate the timeline.
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
// This is probably considered a bad practice in Rust and should be fixed eventually,
// but will cause large code changes.
pub trait TimelineWriter<'a> {
/// Put a new page version that can be constructed from a WAL record
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>;
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
/// Track the end of the latest digested WAL record.
///
/// Call this after you have finished writing all the WAL up to 'lsn'.
///
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
/// the latest and can be read.
fn finish_write(&self, lsn: Lsn);
fn update_current_logical_size(&self, delta: isize);
}
#[cfg(test)]
pub mod repo_harness {
use bytes::BytesMut;
use once_cell::sync::Lazy;
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
use std::{fs, path::PathBuf};
use crate::{
config::PageServerConf,
layered_repository::LayeredRepository,
walredo::{WalRedoError, WalRedoManager},
};
use super::*;
use crate::tenant_config::{TenantConf, TenantConfOpt};
use hex_literal::hex;
use utils::zid::ZTenantId;
pub const TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
pub const NEW_TIMELINE_ID: ZTimelineId =
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
/// Convenience function to create a page image with given string as the only content
#[allow(non_snake_case)]
pub fn TEST_IMG(s: &str) -> Bytes {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
buf.resize(64, 0);
buf.freeze()
}
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
impl From<TenantConf> for TenantConfOpt {
fn from(tenant_conf: TenantConf) -> Self {
Self {
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
compaction_target_size: Some(tenant_conf.compaction_target_size),
compaction_period: Some(tenant_conf.compaction_period),
compaction_threshold: Some(tenant_conf.compaction_threshold),
gc_horizon: Some(tenant_conf.gc_horizon),
gc_period: Some(tenant_conf.gc_period),
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
pitr_interval: Some(tenant_conf.pitr_interval),
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
}
}
}
pub struct RepoHarness<'a> {
pub conf: &'static PageServerConf,
pub tenant_conf: TenantConf,
pub tenant_id: ZTenantId,
pub lock_guard: (
Option<RwLockReadGuard<'a, ()>>,
Option<RwLockWriteGuard<'a, ()>>,
),
}
impl<'a> RepoHarness<'a> {
pub fn create(test_name: &'static str) -> Result<Self> {
Self::create_internal(test_name, false)
}
pub fn create_exclusive(test_name: &'static str) -> Result<Self> {
Self::create_internal(test_name, true)
}
fn create_internal(test_name: &'static str, exclusive: bool) -> Result<Self> {
let lock_guard = if exclusive {
(None, Some(LOCK.write().unwrap()))
} else {
(Some(LOCK.read().unwrap()), None)
};
let repo_dir = PageServerConf::test_repo_dir(test_name);
let _ = fs::remove_dir_all(&repo_dir);
fs::create_dir_all(&repo_dir)?;
let conf = PageServerConf::dummy_conf(repo_dir);
// Make a static copy of the config. This can never be free'd, but that's
// OK in a test.
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
let tenant_conf = TenantConf::dummy_conf();
let tenant_id = ZTenantId::generate();
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
Ok(Self {
conf,
tenant_conf,
tenant_id,
lock_guard,
})
}
pub fn load(&self) -> LayeredRepository {
self.try_load().expect("failed to load test repo")
}
pub fn try_load(&self) -> Result<LayeredRepository> {
let walredo_mgr = Arc::new(TestRedoManager);
let repo = LayeredRepository::new(
self.conf,
TenantConfOpt::from(self.tenant_conf),
walredo_mgr,
self.tenant_id,
RemoteIndex::default(),
false,
);
// populate repo with locally available timelines
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
.expect("should be able to read timelines dir")
{
let timeline_dir_entry = timeline_dir_entry.unwrap();
let timeline_id: ZTimelineId = timeline_dir_entry
.path()
.file_name()
.unwrap()
.to_string_lossy()
.parse()
.unwrap();
repo.attach_timeline(timeline_id)?;
}
Ok(repo)
}
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
self.conf.timeline_path(timeline_id, &self.tenant_id)
}
}
// Mock WAL redo manager that doesn't do much
pub struct TestRedoManager;
impl WalRedoManager for TestRedoManager {
fn request_redo(
&self,
key: Key,
lsn: Lsn,
base_img: Option<Bytes>,
records: Vec<(Lsn, ZenithWalRecord)>,
) -> Result<Bytes, WalRedoError> {
let s = format!(
"redo for {} to get to {}, with {} and {} records",
key,
lsn,
if base_img.is_some() {
"base image"
} else {
"no base image"
},
records.len()
);
println!("{}", s);
Ok(TEST_IMG(&s))
}
}
}
///
/// Tests that should work the same with any Repository/Timeline implementation.
///
#[allow(clippy::bool_assert_comparison)]
#[cfg(test)]
mod tests {
use super::repo_harness::*;
use super::*;
//use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
//use std::sync::Arc;
use bytes::BytesMut;
use hex_literal::hex;
use once_cell::sync::Lazy;
static TEST_KEY: Lazy<Key> =
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
#[test]
fn test_basic() -> Result<()> {
let repo = RepoHarness::create("test_basic")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
writer.finish_write(Lsn(0x20));
drop(writer);
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
Ok(())
}
#[test]
fn no_duplicate_timelines() -> Result<()> {
let repo = RepoHarness::create("no_duplicate_timelines")?.load();
let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
}
Ok(())
}
/// Convenience function to create a page image with given string as the only content
pub fn test_value(s: &str) -> Value {
let mut buf = BytesMut::new();
buf.extend_from_slice(s.as_bytes());
Value::Image(buf.freeze())
}
///
/// Test branch creation
///
#[test]
fn test_branch() -> Result<()> {
let repo = RepoHarness::create("test_branch")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
let writer = tline.writer();
use std::str::from_utf8;
#[allow(non_snake_case)]
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
#[allow(non_snake_case)]
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
// Insert a value on the timeline
writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
writer.finish_write(Lsn(0x20));
writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
writer.finish_write(Lsn(0x30));
writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
writer.finish_write(Lsn(0x40));
//assert_current_logical_size(&tline, Lsn(0x40));
// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
let new_writer = newtline.writer();
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
new_writer.finish_write(Lsn(0x40));
// Check page contents on both branches
assert_eq!(
from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
"foo at 0x40"
);
assert_eq!(
from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
"bar at 0x40"
);
assert_eq!(
from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
"foobar at 0x20"
);
//assert_current_logical_size(&tline, Lsn(0x40));
Ok(())
}
fn make_some_layers<T: Timeline>(tline: &T, start_lsn: Lsn) -> Result<()> {
let mut lsn = start_lsn;
#[allow(non_snake_case)]
{
let writer = tline.writer();
// Create a relation on the timeline
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
}
tline.checkpoint(CheckpointConfig::Forced)?;
{
let writer = tline.writer();
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
}
tline.checkpoint(CheckpointConfig::Forced)
}
#[test]
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
let repo =
RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
// and compaction works. But it does set the 'cutoff' point so that the cross check
// below should fail.
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
Ok(_) => panic!("branching should have failed"),
Err(err) => {
assert!(err.to_string().contains("invalid branch start lsn"));
assert!(err
.source()
.unwrap()
.to_string()
.contains("we might've already garbage collected needed data"))
}
}
Ok(())
}
#[test]
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
Ok(_) => panic!("branching should have failed"),
Err(err) => {
assert!(&err.to_string().contains("invalid branch start lsn"));
assert!(&err
.source()
.unwrap()
.to_string()
.contains("is earlier than latest GC horizon"));
}
}
Ok(())
}
/*
// FIXME: This currently fails to error out. Calling GC doesn't currently
// remove the old value, we'd need to work a little harder
#[test]
fn test_prohibit_get_for_garbage_collected_data() -> Result<()> {
let repo =
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
match tline.get(*TEST_KEY, Lsn(0x25)) {
Ok(_) => panic!("request for page should have failed"),
Err(err) => assert!(err.to_string().contains("not found at")),
}
Ok(())
}
*/
#[test]
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
let repo =
RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
Ok(())
}
#[test]
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
// run gc on parent
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// Check that the data is still accessible on the branch.
assert_eq!(
newtline.get(*TEST_KEY, Lsn(0x50))?,
TEST_IMG(&format!("foo at {}", Lsn(0x40)))
);
Ok(())
}
#[test]
fn timeline_load() -> Result<()> {
const TEST_NAME: &str = "timeline_load";
let harness = RepoHarness::create(TEST_NAME)?;
{
let repo = harness.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
let repo = harness.load();
let tline = repo
.get_timeline(TIMELINE_ID)
.expect("cannot load timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());
let tline = repo
.get_timeline(TIMELINE_ID)
.expect("cannot load timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
Ok(())
}
#[test]
fn timeline_load_with_ancestor() -> Result<()> {
const TEST_NAME: &str = "timeline_load_with_ancestor";
let harness = RepoHarness::create(TEST_NAME)?;
// create two timelines
{
let repo = harness.load();
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tline.checkpoint(CheckpointConfig::Forced)?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
tline.checkpoint(CheckpointConfig::Forced)?;
}
// check that both of them are initially unloaded
let repo = harness.load();
{
let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
let tline = repo
.get_timeline(NEW_TIMELINE_ID)
.expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
}
// load only child timeline
let _ = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("cannot load timeline");
// check that both, child and ancestor are loaded
let tline = repo
.get_timeline(NEW_TIMELINE_ID)
.expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
assert!(matches!(tline, RepositoryTimeline::Loaded(_)));
Ok(())
}
}

View File

@@ -156,7 +156,7 @@ use std::{
use anyhow::{anyhow, bail, Context};
use futures::stream::{FuturesUnordered, StreamExt};
use once_cell::sync::{Lazy, OnceCell};
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use remote_storage::GenericRemoteStorage;
use tokio::{
fs,
runtime::Runtime,
@@ -253,36 +253,20 @@ pub struct SyncStartupData {
/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
pub fn start_local_timeline_sync(
config: &'static PageServerConf,
storage: Option<Arc<GenericRemoteStorage>>,
) -> anyhow::Result<SyncStartupData> {
let local_timeline_files = local_tenant_timeline_files(config)
.context("Failed to collect local tenant timeline files")?;
match config.remote_storage_config.as_ref() {
Some(storage_config) => {
match GenericRemoteStorage::new(config.workdir.clone(), storage_config)
.context("Failed to init the generic remote storage")?
{
GenericRemoteStorage::Local(local_fs_storage) => {
storage_sync::spawn_storage_sync_thread(
config,
local_timeline_files,
local_fs_storage,
storage_config.max_concurrent_syncs,
storage_config.max_sync_errors,
)
}
GenericRemoteStorage::S3(s3_bucket_storage) => {
storage_sync::spawn_storage_sync_thread(
config,
local_timeline_files,
s3_bucket_storage,
storage_config.max_concurrent_syncs,
storage_config.max_sync_errors,
)
}
}
.context("Failed to spawn the storage sync thread")
}
match storage.zip(config.remote_storage_config.as_ref()) {
Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread(
config,
local_timeline_files,
storage,
storage_config.max_concurrent_syncs,
storage_config.max_sync_errors,
)
.context("Failed to spawn the storage sync thread"),
None => {
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
@@ -810,17 +794,13 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
/// Launch a thread to perform remote storage sync tasks.
/// See module docs for loop step description.
pub(super) fn spawn_storage_sync_thread<P, S>(
pub(super) fn spawn_storage_sync_thread(
conf: &'static PageServerConf,
local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
storage: S,
storage: Arc<GenericRemoteStorage>,
max_concurrent_timelines_sync: NonZeroUsize,
max_sync_errors: NonZeroU32,
) -> anyhow::Result<SyncStartupData>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> anyhow::Result<SyncStartupData> {
let sync_queue = SyncQueue::new(max_concurrent_timelines_sync);
SYNC_QUEUE
.set(sync_queue)
@@ -860,7 +840,7 @@ where
storage_sync_loop(
runtime,
conf,
(Arc::new(storage), remote_index_clone, sync_queue),
(storage, remote_index_clone, sync_queue),
max_sync_errors,
);
Ok(())
@@ -873,15 +853,12 @@ where
})
}
fn storage_sync_loop<P, S>(
fn storage_sync_loop(
runtime: Runtime,
conf: &'static PageServerConf,
(storage, index, sync_queue): (Arc<S>, RemoteIndex, &SyncQueue),
(storage, index, sync_queue): (Arc<GenericRemoteStorage>, RemoteIndex, &SyncQueue),
max_sync_errors: NonZeroU32,
) where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) {
info!("Starting remote storage sync loop");
loop {
let loop_storage = Arc::clone(&storage);
@@ -983,18 +960,14 @@ enum UploadStatus {
Nothing,
}
async fn process_batches<P, S>(
async fn process_batches(
conf: &'static PageServerConf,
max_sync_errors: NonZeroU32,
storage: Arc<S>,
storage: Arc<GenericRemoteStorage>,
index: &RemoteIndex,
batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
sync_queue: &SyncQueue,
) -> HashSet<ZTenantId>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> HashSet<ZTenantId> {
let mut sync_results = batched_tasks
.into_iter()
.map(|(sync_id, batch)| {
@@ -1030,17 +1003,13 @@ where
downloaded_timelines
}
async fn process_sync_task_batch<P, S>(
async fn process_sync_task_batch(
conf: &'static PageServerConf,
(storage, index, sync_queue): (Arc<S>, RemoteIndex, &SyncQueue),
(storage, index, sync_queue): (Arc<GenericRemoteStorage>, RemoteIndex, &SyncQueue),
max_sync_errors: NonZeroU32,
sync_id: ZTenantTimelineId,
batch: SyncTaskBatch,
) -> DownloadStatus
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> DownloadStatus {
let sync_start = Instant::now();
let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() };
@@ -1175,19 +1144,15 @@ where
download_status
}
async fn download_timeline_data<P, S>(
async fn download_timeline_data(
conf: &'static PageServerConf,
(storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
current_remote_timeline: Option<&RemoteTimeline>,
sync_id: ZTenantTimelineId,
new_download_data: SyncData<LayersDownload>,
sync_start: Instant,
task_name: &str,
) -> DownloadStatus
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> DownloadStatus {
match download_timeline_layers(
conf,
storage,
@@ -1298,17 +1263,14 @@ async fn update_local_metadata(
Ok(())
}
async fn delete_timeline_data<P, S>(
async fn delete_timeline_data(
conf: &'static PageServerConf,
(storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
sync_id: ZTenantTimelineId,
mut new_delete_data: SyncData<LayersDeletion>,
sync_start: Instant,
task_name: &str,
) where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) {
let timeline_delete = &mut new_delete_data.data;
if !timeline_delete.deletion_registered {
@@ -1343,19 +1305,15 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMeta
.context("Failed to parse metadata bytes")
}
async fn upload_timeline_data<P, S>(
async fn upload_timeline_data(
conf: &'static PageServerConf,
(storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
current_remote_timeline: Option<&RemoteTimeline>,
sync_id: ZTenantTimelineId,
new_upload_data: SyncData<LayersUpload>,
sync_start: Instant,
task_name: &str,
) -> UploadStatus
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> UploadStatus {
let mut uploaded_data = match upload_timeline_layers(
storage,
sync_queue,
@@ -1406,17 +1364,13 @@ enum RemoteDataUpdate<'a> {
Delete(&'a HashSet<PathBuf>),
}
async fn update_remote_data<P, S>(
async fn update_remote_data(
conf: &'static PageServerConf,
storage: &S,
storage: &GenericRemoteStorage,
index: &RemoteIndex,
sync_id: ZTenantTimelineId,
update: RemoteDataUpdate<'_>,
) -> anyhow::Result<()>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> anyhow::Result<()> {
let updated_remote_timeline = {
let mut index_accessor = index.write().await;
@@ -1642,7 +1596,7 @@ fn register_sync_status(
mod test_utils {
use utils::lsn::Lsn;
use crate::repository::repo_harness::RepoHarness;
use crate::layered_repository::repo_harness::RepoHarness;
use super::*;
@@ -1687,7 +1641,7 @@ mod test_utils {
#[cfg(test)]
mod tests {
use super::test_utils::dummy_metadata;
use crate::repository::repo_harness::TIMELINE_ID;
use crate::layered_repository::repo_harness::TIMELINE_ID;
use hex_literal::hex;
use utils::lsn::Lsn;

View File

@@ -1,27 +1,25 @@
//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage.
use std::path::Path;
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use tracing::{debug, error, info};
use crate::storage_sync::{SyncQueue, SyncTask};
use remote_storage::RemoteStorage;
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use utils::zid::ZTenantTimelineId;
use super::{LayersDeletion, SyncData};
/// Attempts to remove the timleline layers from the remote storage.
/// If the task had not adjusted the metadata before, the deletion will fail.
pub(super) async fn delete_timeline_layers<'a, P, S>(
storage: &'a S,
pub(super) async fn delete_timeline_layers<'a>(
storage: &'a GenericRemoteStorage,
sync_queue: &SyncQueue,
sync_id: ZTenantTimelineId,
mut delete_data: SyncData<LayersDeletion>,
) -> bool
where
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> bool {
if !delete_data.data.deletion_registered {
error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing");
delete_data.retries += 1;
@@ -45,25 +43,14 @@ where
let mut delete_tasks = layers_to_delete
.into_iter()
.map(|local_layer_path| async {
let storage_path =
match storage
.remote_object_id(&local_layer_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
local_layer_path.display()
)
}) {
Ok(path) => path,
Err(e) => return Err((e, local_layer_path)),
};
match storage.delete(&storage_path).await.with_context(|| {
format!(
"Failed to delete remote layer from storage at '{:?}'",
storage_path
)
}) {
match match storage {
GenericRemoteStorage::Local(storage) => {
remove_storage_object(storage, &local_layer_path).await
}
GenericRemoteStorage::S3(storage) => {
remove_storage_object(storage, &local_layer_path).await
}
} {
Ok(()) => Ok(local_layer_path),
Err(e) => Err((e, local_layer_path)),
}
@@ -101,6 +88,28 @@ where
errored
}
async fn remove_storage_object<P, S>(storage: &S, local_layer_path: &Path) -> anyhow::Result<()>
where
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let storage_path = storage
.remote_object_id(local_layer_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
local_layer_path.display()
)
})?;
storage.delete(&storage_path).await.with_context(|| {
format!(
"Failed to delete remote layer from storage at '{:?}'",
storage_path
)
})
}
#[cfg(test)]
mod tests {
use std::{collections::HashSet, num::NonZeroUsize};
@@ -111,10 +120,10 @@ mod tests {
use utils::lsn::Lsn;
use crate::{
repository::repo_harness::{RepoHarness, TIMELINE_ID},
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
storage_sync::test_utils::{create_local_timeline, dummy_metadata},
};
use remote_storage::LocalFs;
use remote_storage::{LocalFs, RemoteStorage};
use super::*;
@@ -123,10 +132,10 @@ mod tests {
let harness = RepoHarness::create("delete_timeline_negative")?;
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_path_buf(),
harness.conf.workdir.clone(),
)?;
)?);
let deleted = delete_timeline_layers(
&storage,
@@ -158,17 +167,20 @@ mod tests {
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let layer_files = ["a", "b", "c", "d"];
let storage = LocalFs::new(
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_path_buf(),
harness.conf.workdir.clone(),
)?;
)?);
let local_storage = storage.as_local().unwrap();
let current_retries = 3;
let metadata = dummy_metadata(Lsn(0x30));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
let timeline_upload =
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
for local_path in timeline_upload.layers_to_upload {
let remote_path = storage.remote_object_id(&local_path)?;
let remote_path = local_storage.remote_object_id(&local_path)?;
let remote_parent_dir = remote_path.parent().unwrap();
if !remote_parent_dir.exists() {
fs::create_dir_all(&remote_parent_dir).await?;
@@ -176,11 +188,11 @@ mod tests {
fs::copy(&local_path, &remote_path).await?;
}
assert_eq!(
storage
local_storage
.list()
.await?
.into_iter()
.map(|remote_path| storage.local_path(&remote_path).unwrap())
.map(|remote_path| local_storage.local_path(&remote_path).unwrap())
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
.sorted()
.collect::<Vec<_>>(),
@@ -213,11 +225,11 @@ mod tests {
assert!(deleted, "Should be able to delete timeline files");
assert_eq!(
storage
local_storage
.list()
.await?
.into_iter()
.map(|remote_path| storage.local_path(&remote_path).unwrap())
.map(|remote_path| local_storage.local_path(&remote_path).unwrap())
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
.sorted()
.collect::<Vec<_>>(),

View File

@@ -9,7 +9,9 @@ use std::{
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage};
use remote_storage::{
path_with_suffix_extension, Download, DownloadError, GenericRemoteStorage, RemoteStorage,
};
use tokio::{
fs,
io::{self, AsyncWriteExt},
@@ -62,15 +64,11 @@ impl Default for TenantIndexParts {
}
}
pub async fn download_index_parts<P, S>(
pub async fn download_index_parts(
conf: &'static PageServerConf,
storage: &S,
storage: &GenericRemoteStorage,
keys: HashSet<ZTenantTimelineId>,
) -> HashMap<ZTenantId, TenantIndexParts>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> HashMap<ZTenantId, TenantIndexParts> {
let mut index_parts: HashMap<ZTenantId, TenantIndexParts> = HashMap::new();
let mut part_downloads = keys
@@ -114,60 +112,17 @@ where
/// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests.
/// At least one request to obtain a list of tenant timelines (more requests is there are more than 1000 timelines).
/// And then will attempt to download all index files that belong to these timelines.
pub async fn gather_tenant_timelines_index_parts<P, S>(
pub async fn gather_tenant_timelines_index_parts(
conf: &'static PageServerConf,
storage: &S,
storage: &GenericRemoteStorage,
tenant_id: ZTenantId,
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>>
where
P: RemoteObjectName + Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>> {
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
format!(
"Failed to get tenant storage path for local path '{}'",
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(tenant_storage_path))
let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id)
.await
.with_context(|| {
format!(
"Failed to list tenant storage path to get remote timelines to download: {}",
tenant_id
)
})?;
.with_context(|| format!("Failed to list timeline sync ids for tenat {tenant_id}"))?;
if timelines.is_empty() {
anyhow::bail!(
"no timelines found on the remote storage for tenant {}",
tenant_id
)
}
let mut sync_ids = HashSet::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: ZTimelineId = object_name
.parse()
.with_context(|| {
format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'")
})?;
sync_ids.insert(ZTenantTimelineId {
tenant_id,
timeline_id,
});
}
match download_index_parts(conf, storage, sync_ids)
match download_index_parts(conf, storage, timeline_sync_ids)
.await
.remove(&tenant_id)
.ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))?
@@ -180,29 +135,15 @@ where
}
/// Retrieves index data from the remote storage for a given timeline.
async fn download_index_part<P, S>(
async fn download_index_part(
conf: &'static PageServerConf,
storage: &S,
storage: &GenericRemoteStorage,
sync_id: ZTenantTimelineId,
) -> Result<IndexPart, DownloadError>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> Result<IndexPart, DownloadError> {
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
.with_file_name(IndexPart::FILE_NAME)
.with_extension(IndexPart::FILE_EXTENSION);
let part_storage_path = storage
.remote_object_id(&index_part_path)
.with_context(|| {
format!(
"Failed to get the index part storage path for local path '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::BadInput)?;
let mut index_part_download = storage.download(&part_storage_path).await?;
let mut index_part_download = download_storage_object(storage, &index_part_path).await?;
let mut index_part_bytes = Vec::new();
io::copy(
@@ -211,14 +152,18 @@ where
)
.await
.with_context(|| {
format!("Failed to download an index part from storage path {part_storage_path:?}")
format!(
"Failed to download an index part into file '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::Other)?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
.with_context(|| {
format!(
"Failed to deserialize index part file from storage path '{part_storage_path:?}'"
"Failed to deserialize index part file into file '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::Other)?;
@@ -249,18 +194,14 @@ pub(super) enum DownloadedTimeline {
/// updated in the end, if the remote one contains a newer disk_consistent_lsn.
///
/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task.
pub(super) async fn download_timeline_layers<'a, P, S>(
pub(super) async fn download_timeline_layers<'a>(
conf: &'static PageServerConf,
storage: &'a S,
storage: &'a GenericRemoteStorage,
sync_queue: &'a SyncQueue,
remote_timeline: Option<&'a RemoteTimeline>,
sync_id: ZTenantTimelineId,
mut download_data: SyncData<LayersDownload>,
) -> DownloadedTimeline
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> DownloadedTimeline {
let remote_timeline = match remote_timeline {
Some(remote_timeline) => {
if !remote_timeline.awaits_download {
@@ -300,15 +241,6 @@ where
layer_desination_path.display()
);
} else {
let layer_storage_path = storage
.remote_object_id(&layer_desination_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
layer_desination_path.display()
)
})?;
// Perform a rename inspired by durable_rename from file_utils.c.
// The sequence:
// write(tmp)
@@ -329,19 +261,23 @@ where
temp_file_path.display()
)
})?;
let mut download = storage
.download(&layer_storage_path)
let mut layer_download = download_storage_object(storage, &layer_desination_path)
.await
.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
"Failed to initiate the download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
io::copy(&mut layer_download.download_stream, &mut destination_file)
.await
.with_context(|| {
format!(
"Failed to download the layer for {sync_id} into file '{}'",
temp_file_path.display()
)
})?;
io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!(
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations
@@ -429,6 +365,121 @@ where
}
}
async fn download_storage_object(
storage: &GenericRemoteStorage,
to_path: &Path,
) -> Result<Download, DownloadError> {
async fn do_download_storage_object<P, S>(
storage: &S,
to_path: &Path,
) -> Result<Download, DownloadError>
where
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let remote_object_path = storage
.remote_object_id(to_path)
.with_context(|| {
format!(
"Failed to get the storage path for target local path '{}'",
to_path.display()
)
})
.map_err(DownloadError::BadInput)?;
storage.download(&remote_object_path).await
}
match storage {
GenericRemoteStorage::Local(storage) => do_download_storage_object(storage, to_path).await,
GenericRemoteStorage::S3(storage) => do_download_storage_object(storage, to_path).await,
}
}
async fn get_timeline_sync_ids(
storage: &GenericRemoteStorage,
tenant_path: &Path,
tenant_id: ZTenantId,
) -> anyhow::Result<HashSet<ZTenantTimelineId>> {
let timeline_ids: Vec<ZTimelineId> = match storage {
GenericRemoteStorage::Local(storage) => list_prefixes(storage, tenant_path)
.await?
.into_iter()
.map(|timeline_directory_path| {
timeline_directory_path
.file_stem()
.with_context(|| {
format!(
"Failed to get timeline id string from file '{}'",
timeline_directory_path.display()
)
})?
.to_string_lossy()
.as_ref()
.parse()
.with_context(|| {
format!(
"failed to parse directory name '{}' as timeline id",
timeline_directory_path.display()
)
})
})
.collect::<anyhow::Result<_>>(),
GenericRemoteStorage::S3(storage) => list_prefixes(storage, tenant_path)
.await?
.into_iter()
.map(|s3_path| {
s3_path
.object_name()
.with_context(|| {
format!("Failed to get object name out of S3 path {s3_path:?}")
})?
.parse()
.with_context(|| {
format!("failed to parse object name '{s3_path:?}' as timeline id")
})
})
.collect::<anyhow::Result<_>>(),
}
.with_context(|| {
format!("Tenant {tenant_id} has at least one incorrect timeline subdirectory")
})?;
if timeline_ids.is_empty() {
anyhow::bail!("no timelines found on the remote storage for tenant {tenant_id}")
}
Ok(timeline_ids
.into_iter()
.map(|timeline_id| ZTenantTimelineId {
tenant_id,
timeline_id,
})
.collect())
}
async fn list_prefixes<P, S>(storage: &S, tenant_path: &Path) -> anyhow::Result<Vec<P>>
where
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| {
format!(
"Failed to get tenant storage path for local path '{}'",
tenant_path.display()
)
})?;
storage
.list_prefixes(Some(&tenant_storage_path))
.await
.with_context(|| {
format!(
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
)
})
}
async fn fsync_path(path: impl AsRef<Path>) -> Result<(), io::Error> {
fs::File::open(path).await?.sync_all().await
}
@@ -445,7 +496,7 @@ mod tests {
use utils::lsn::Lsn;
use crate::{
repository::repo_harness::{RepoHarness, TIMELINE_ID},
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
storage_sync::{
index::RelativePath,
test_utils::{create_local_timeline, dummy_metadata},
@@ -461,10 +512,11 @@ mod tests {
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
let storage = LocalFs::new(
tempdir()?.path().to_path_buf(),
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_owned(),
harness.conf.workdir.clone(),
)?;
)?);
let local_storage = storage.as_local().unwrap();
let current_retries = 3;
let metadata = dummy_metadata(Lsn(0x30));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
@@ -472,7 +524,7 @@ mod tests {
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
for local_path in timeline_upload.layers_to_upload {
let remote_path = storage.remote_object_id(&local_path)?;
let remote_path = local_storage.remote_object_id(&local_path)?;
let remote_parent_dir = remote_path.parent().unwrap();
if !remote_parent_dir.exists() {
fs::create_dir_all(&remote_parent_dir).await?;
@@ -558,7 +610,10 @@ mod tests {
let harness = RepoHarness::create("download_timeline_negatives")?;
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_owned(),
harness.conf.workdir.clone(),
)?);
let empty_remote_timeline_download = download_timeline_layers(
harness.conf,
@@ -614,10 +669,11 @@ mod tests {
let harness = RepoHarness::create("test_download_index_part")?;
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(
tempdir()?.path().to_path_buf(),
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_owned(),
harness.conf.workdir.clone(),
)?;
)?);
let local_storage = storage.as_local().unwrap();
let metadata = dummy_metadata(Lsn(0x30));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
@@ -638,7 +694,7 @@ mod tests {
metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
.with_file_name(IndexPart::FILE_NAME)
.with_extension(IndexPart::FILE_EXTENSION);
let storage_path = storage.remote_object_id(&local_index_part_path)?;
let storage_path = local_storage.remote_object_id(&local_index_part_path)?;
fs::create_dir_all(storage_path.parent().unwrap()).await?;
fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?;

View File

@@ -210,7 +210,7 @@ impl RemoteTimelineIndex {
}
/// Restored index part data about the timeline, stored in the remote index.
#[derive(Debug, PartialEq, Eq, Clone)]
#[derive(Debug, Clone)]
pub struct RemoteTimeline {
timeline_layers: HashSet<PathBuf>,
missing_layers: HashSet<PathBuf>,
@@ -341,7 +341,7 @@ mod tests {
use std::collections::BTreeSet;
use super::*;
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID};
#[test]
fn index_part_conversion() {

View File

@@ -1,11 +1,14 @@
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
use std::{fmt::Debug, path::PathBuf};
use std::{
fmt::Debug,
path::{Path, PathBuf},
};
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use once_cell::sync::Lazy;
use remote_storage::RemoteStorage;
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use tokio::fs;
use tracing::{debug, error, info, warn};
@@ -30,16 +33,12 @@ static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
});
/// Serializes and uploads the given index part data to the remote storage.
pub(super) async fn upload_index_part<P, S>(
pub(super) async fn upload_index_part(
conf: &'static PageServerConf,
storage: &S,
storage: &GenericRemoteStorage,
sync_id: ZTenantTimelineId,
index_part: IndexPart,
) -> anyhow::Result<()>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> anyhow::Result<()> {
let index_part_bytes = serde_json::to_vec(&index_part)
.context("Failed to serialize index part file into bytes")?;
let index_part_size = index_part_bytes.len();
@@ -48,27 +47,9 @@ where
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
.with_file_name(IndexPart::FILE_NAME)
.with_extension(IndexPart::FILE_EXTENSION);
let index_part_storage_path =
storage
.remote_object_id(&index_part_path)
.with_context(|| {
format!(
"Failed to get the index part storage path for local path '{}'",
index_part_path.display()
)
})?;
storage
.upload(
index_part_bytes,
index_part_size,
&index_part_storage_path,
None,
)
upload_storage_object(storage, index_part_bytes, index_part_size, &index_part_path)
.await
.with_context(|| {
format!("Failed to upload index part to the storage path '{index_part_storage_path:?}'")
})
.with_context(|| format!("Failed to upload index part for '{sync_id}'"))
}
/// Timeline upload result, with extra data, needed for uploading.
@@ -84,17 +65,13 @@ pub(super) enum UploadedTimeline {
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
///
/// On an error, bumps the retries count and reschedules the entire task.
pub(super) async fn upload_timeline_layers<'a, P, S>(
storage: &'a S,
pub(super) async fn upload_timeline_layers<'a>(
storage: &'a GenericRemoteStorage,
sync_queue: &SyncQueue,
remote_timeline: Option<&'a RemoteTimeline>,
sync_id: ZTenantTimelineId,
mut upload_data: SyncData<LayersUpload>,
) -> UploadedTimeline
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
) -> UploadedTimeline {
let upload = &mut upload_data.data;
let new_upload_lsn = upload
.metadata
@@ -132,16 +109,6 @@ where
let mut upload_tasks = layers_to_upload
.into_iter()
.map(|source_path| async move {
let storage_path = storage
.remote_object_id(&source_path)
.with_context(|| {
format!(
"Failed to get the layer storage path for local path '{}'",
source_path.display()
)
})
.map_err(UploadError::Other)?;
let source_file = match fs::File::open(&source_path).await.with_context(|| {
format!(
"Failed to upen a source file for layer '{}'",
@@ -164,15 +131,10 @@ where
.map_err(UploadError::Other)?
.len() as usize;
match storage
.upload(source_file, source_size, &storage_path, None)
match upload_storage_object(storage, source_file, source_size, &source_path)
.await
.with_context(|| {
format!(
"Failed to upload a layer from local path '{}'",
source_path.display()
)
}) {
.with_context(|| format!("Failed to upload layer file for {sync_id}"))
{
Ok(()) => Ok(source_path),
Err(e) => Err(UploadError::MissingLocalFile(source_path, e)),
}
@@ -231,6 +193,51 @@ where
}
}
async fn upload_storage_object(
storage: &GenericRemoteStorage,
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
from_path: &Path,
) -> anyhow::Result<()> {
async fn do_upload_storage_object<P, S>(
storage: &S,
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
from_size_bytes: usize,
from_path: &Path,
) -> anyhow::Result<()>
where
P: std::fmt::Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let target_storage_path = storage.remote_object_id(from_path).with_context(|| {
format!(
"Failed to get the storage path for source local path '{}'",
from_path.display()
)
})?;
storage
.upload(from, from_size_bytes, &target_storage_path, None)
.await
.with_context(|| {
format!(
"Failed to upload from '{}' to storage path '{:?}'",
from_path.display(),
target_storage_path
)
})
}
match storage {
GenericRemoteStorage::Local(storage) => {
do_upload_storage_object(storage, from, from_size_bytes, from_path).await
}
GenericRemoteStorage::S3(storage) => {
do_upload_storage_object(storage, from, from_size_bytes, from_path).await
}
}
}
enum UploadError {
MissingLocalFile(PathBuf, anyhow::Error),
Other(anyhow::Error),
@@ -243,12 +250,12 @@ mod tests {
num::NonZeroUsize,
};
use remote_storage::LocalFs;
use remote_storage::{LocalFs, RemoteStorage};
use tempfile::tempdir;
use utils::lsn::Lsn;
use crate::{
repository::repo_harness::{RepoHarness, TIMELINE_ID},
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
storage_sync::{
index::RelativePath,
test_utils::{create_local_timeline, dummy_metadata},
@@ -264,10 +271,11 @@ mod tests {
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let layer_files = ["a", "b"];
let storage = LocalFs::new(
tempdir()?.path().to_path_buf(),
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_owned(),
harness.conf.workdir.clone(),
)?;
)?);
let local_storage = storage.as_local().unwrap();
let current_retries = 3;
let metadata = dummy_metadata(Lsn(0x30));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
@@ -276,7 +284,7 @@ mod tests {
timeline_upload.metadata = None;
assert!(
storage.list().await?.is_empty(),
local_storage.list().await?.is_empty(),
"Storage should be empty before any uploads are made"
);
@@ -322,7 +330,7 @@ mod tests {
"Successful upload without metadata should not have it returned either"
);
let storage_files = storage.list().await?;
let storage_files = local_storage.list().await?;
assert_eq!(
storage_files.len(),
layer_files.len(),
@@ -331,7 +339,7 @@ mod tests {
assert_eq!(
storage_files
.into_iter()
.map(|storage_path| storage.local_path(&storage_path))
.map(|storage_path| local_storage.local_path(&storage_path))
.collect::<anyhow::Result<BTreeSet<_>>>()?,
layer_files
.into_iter()
@@ -351,7 +359,11 @@ mod tests {
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let layer_files = ["a1", "b1"];
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_owned(),
harness.conf.workdir.clone(),
)?);
let local_storage = storage.as_local().unwrap();
let current_retries = 5;
let metadata = dummy_metadata(Lsn(0x40));
@@ -365,7 +377,7 @@ mod tests {
create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone())
.await?;
assert!(
storage.list().await?.is_empty(),
local_storage.list().await?.is_empty(),
"Storage should be empty before any uploads are made"
);
@@ -414,7 +426,7 @@ mod tests {
"Successful upload should not change its metadata"
);
let storage_files = storage.list().await?;
let storage_files = local_storage.list().await?;
assert_eq!(
storage_files.len(),
layer_files.len(),
@@ -423,7 +435,7 @@ mod tests {
assert_eq!(
storage_files
.into_iter()
.map(|storage_path| storage.local_path(&storage_path))
.map(|storage_path| local_storage.local_path(&storage_path))
.collect::<anyhow::Result<BTreeSet<_>>>()?,
layer_files
.into_iter()
@@ -440,7 +452,11 @@ mod tests {
let harness = RepoHarness::create("test_upload_index_part")?;
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
let storage = GenericRemoteStorage::Local(LocalFs::new(
tempdir()?.path().to_owned(),
harness.conf.workdir.clone(),
)?);
let local_storage = storage.as_local().unwrap();
let metadata = dummy_metadata(Lsn(0x40));
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
@@ -458,12 +474,12 @@ mod tests {
);
assert!(
storage.list().await?.is_empty(),
local_storage.list().await?.is_empty(),
"Storage should be empty before any uploads are made"
);
upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?;
let storage_files = storage.list().await?;
let storage_files = local_storage.list().await?;
assert_eq!(
storage_files.len(),
1,

View File

@@ -3,8 +3,8 @@
use crate::config::PageServerConf;
use crate::http::models::TenantInfo;
use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline};
use crate::repository::Repository;
use crate::layered_repository::{load_metadata, Repository, Timeline};
use crate::repository::RepositoryTimeline;
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
use crate::tenant_config::TenantConfOpt;
@@ -12,6 +12,7 @@ use crate::thread_mgr::ThreadKind;
use crate::walredo::PostgresRedoManager;
use crate::{thread_mgr, timelines, walreceiver};
use anyhow::Context;
use remote_storage::GenericRemoteStorage;
use serde::{Deserialize, Serialize};
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
@@ -21,6 +22,7 @@ use tokio::sync::mpsc;
use tracing::*;
use utils::lsn::Lsn;
pub use tenants_state::try_send_timeline_update;
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
mod tenants_state {
@@ -68,7 +70,7 @@ mod tenants_state {
Ok(())
}
pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) {
pub fn try_send_timeline_update(update: LocalTimelineUpdate) {
match TIMELINE_UPDATE_SENDER
.read()
.expect("Failed to read() timeline_update_sender lock, it got poisoned")
@@ -94,13 +96,7 @@ mod tenants_state {
struct Tenant {
state: TenantState,
/// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk.
repo: Arc<LayeredRepository>,
/// Timelines, located locally in the pageserver's datadir.
/// Timelines can entirely be removed entirely by the `detach` operation only.
///
/// Local timelines have more metadata that's loaded into memory,
/// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`].
local_timelines: HashMap<ZTimelineId, Arc<LayeredTimeline>>,
repo: Arc<Repository>,
}
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
@@ -136,7 +132,10 @@ impl fmt::Display for TenantState {
/// Initialize repositories with locally available timelines.
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
/// are scheduled for download and added to the repository once download is completed.
pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIndex> {
pub fn init_tenant_mgr(
conf: &'static PageServerConf,
remote_storage: Option<Arc<GenericRemoteStorage>>,
) -> anyhow::Result<RemoteIndex> {
let (timeline_updates_sender, timeline_updates_receiver) =
mpsc::unbounded_channel::<LocalTimelineUpdate>();
tenants_state::set_timeline_update_sender(timeline_updates_sender)?;
@@ -145,7 +144,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
let SyncStartupData {
remote_index,
local_timeline_init_statuses,
} = storage_sync::start_local_timeline_sync(conf)
} = storage_sync::start_local_timeline_sync(conf, remote_storage)
.context("Failed to set up local files sync with external storage")?;
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
@@ -177,15 +176,15 @@ pub enum LocalTimelineUpdate {
},
Attach {
id: ZTenantTimelineId,
datadir: Arc<LayeredTimeline>,
timeline: Arc<Timeline>,
},
}
impl std::fmt::Debug for LocalTimelineUpdate {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(),
Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(),
Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(),
Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(),
}
}
}
@@ -289,7 +288,6 @@ pub fn create_tenant_repository(
v.insert(Tenant {
state: TenantState::Idle,
repo,
local_timelines: HashMap::new(),
});
Ok(Some(tenant_id))
}
@@ -365,7 +363,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
Ok(())
}
pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<LayeredRepository>> {
pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<Repository>> {
let m = tenants_state::read_tenants();
let tenant = m
.get(&tenant_id)
@@ -379,21 +377,15 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<Lay
pub fn get_local_timeline_with_load(
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) -> anyhow::Result<Arc<LayeredTimeline>> {
let mut m = tenants_state::write_tenants();
let tenant = m
.get_mut(&tenant_id)
.with_context(|| format!("Tenant {tenant_id} not found"))?;
if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) {
Ok(Arc::clone(page_tline))
} else {
let page_tline = load_local_timeline(&tenant.repo, timeline_id)
.with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?;
tenant
.local_timelines
.insert(timeline_id, Arc::clone(&page_tline));
Ok(page_tline)
) -> anyhow::Result<Arc<Timeline>> {
let repository = get_repository_for_tenant(tenant_id)?;
match repository.get_timeline(timeline_id) {
Some(RepositoryTimeline::Loaded(loaded_timeline)) => {
loaded_timeline.init_logical_size()?;
Ok(loaded_timeline)
}
_ => load_local_timeline(&repository, timeline_id)
.with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")),
}
}
@@ -420,10 +412,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow
thread_mgr::shutdown_threads(None, None, Some(timeline_id));
debug!("thread shutdown completed");
match tenants_state::write_tenants().get_mut(&tenant_id) {
Some(tenant) => {
tenant.repo.delete_timeline(timeline_id)?;
tenant.local_timelines.remove(&timeline_id);
}
Some(tenant) => tenant.repo.delete_timeline(timeline_id)?,
None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"),
}
@@ -435,23 +424,21 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any
// shutdown the tenant and timeline threads: gc, compaction, page service threads)
thread_mgr::shutdown_threads(None, Some(tenant_id), None);
// FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state?
// send stop signal to wal receiver and collect join handles while holding the lock
let walreceiver_join_handles = {
let tenants = tenants_state::write_tenants();
let tenant = tenants.get(&tenant_id).context("tenant not found")?;
let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len());
for timeline_id in tenant.local_timelines.keys() {
let mut walreceiver_join_handles = Vec::new();
let removed_tenant = {
let mut tenants_accessor = tenants_state::write_tenants();
tenants_accessor.remove(&tenant_id)
};
if let Some(tenant) = removed_tenant {
for (timeline_id, _) in tenant.repo.list_timelines() {
let (sender, receiver) = std::sync::mpsc::channel::<()>();
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
id: ZTenantTimelineId::new(tenant_id, *timeline_id),
id: ZTenantTimelineId::new(tenant_id, timeline_id),
join_confirmation_sender: sender,
});
walreceiver_join_handles.push((*timeline_id, receiver));
walreceiver_join_handles.push((timeline_id, receiver));
}
// drop the tenants lock
walreceiver_join_handles
};
}
// wait for wal receivers to stop without holding the lock, because walreceiver
// will attempt to change tenant state which is protected by the same global tenants lock.
@@ -484,19 +471,13 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any
}
fn load_local_timeline(
repo: &LayeredRepository,
repo: &Repository,
timeline_id: ZTimelineId,
) -> anyhow::Result<Arc<LayeredTimeline>> {
) -> anyhow::Result<Arc<Timeline>> {
let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| {
format!("Inmem timeline {timeline_id} not found in tenant's repository")
})?;
inmem_timeline.init_logical_size()?;
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach {
id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
datadir: Arc::clone(&inmem_timeline),
});
Ok(inmem_timeline)
}
@@ -588,37 +569,24 @@ fn init_local_repository(
}
fn attach_downloaded_tenant(
repo: &LayeredRepository,
repo: &Repository,
downloaded_timelines: HashSet<ZTimelineId>,
) -> anyhow::Result<()> {
let mut registration_queue = Vec::with_capacity(downloaded_timelines.len());
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
for timeline_id in downloaded_timelines {
// first, register timeline metadata to ensure ancestors will be found later during layer load
for &timeline_id in &downloaded_timelines {
repo.attach_timeline(timeline_id).with_context(|| {
format!("Failed to load timeline {timeline_id} into in-memory repository")
})?;
registration_queue.push(timeline_id);
}
for timeline_id in registration_queue {
let tenant_id = repo.tenant_id();
match tenants_state::write_tenants().get_mut(&tenant_id) {
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
Entry::Occupied(_) => {
anyhow::bail!("Local timeline {timeline_id} already registered")
}
Entry::Vacant(v) => {
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
format!("Failed to register add local timeline for tenant {tenant_id}")
})?);
}
},
None => anyhow::bail!(
"Tenant {} not found in local tenant state",
repo.tenant_id()
),
}
// and then load its layers in memory
for timeline_id in downloaded_timelines {
let _ = load_local_timeline(repo, timeline_id).with_context(|| {
format!(
"Failed to register add local timeline for tenant {}",
repo.tenant_id(),
)
})?;
}
Ok(())
@@ -630,14 +598,14 @@ fn load_local_repo(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
remote_index: &RemoteIndex,
) -> anyhow::Result<Arc<LayeredRepository>> {
) -> anyhow::Result<Arc<Repository>> {
let mut m = tenants_state::write_tenants();
let tenant = m.entry(tenant_id).or_insert_with(|| {
// Set up a WAL redo manager, for applying WAL records.
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
// Set up an object repository, for actual data storage.
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
let repo: Arc<Repository> = Arc::new(Repository::new(
conf,
TenantConfOpt::default(),
Arc::new(walredo_mgr),
@@ -648,12 +616,11 @@ fn load_local_repo(
Tenant {
state: TenantState::Idle,
repo,
local_timelines: HashMap::new(),
}
});
// Restore tenant config
let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?;
tenant.repo.update_tenant_config(tenant_conf)?;
Ok(Arc::clone(&tenant.repo))

View File

@@ -5,7 +5,6 @@ use std::collections::HashMap;
use std::ops::ControlFlow;
use std::time::Duration;
use crate::repository::Repository;
use crate::tenant_mgr::TenantState;
use crate::thread_mgr::ThreadKind;
use crate::{tenant_mgr, thread_mgr};

View File

@@ -20,15 +20,14 @@ use utils::{
use crate::import_datadir;
use crate::tenant_mgr;
use crate::CheckpointConfig;
use crate::{
config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex,
tenant_config::TenantConfOpt,
config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt,
};
use crate::{
layered_repository::{LayeredRepository, LayeredTimeline},
layered_repository::{Repository, Timeline},
walredo::WalRedoManager,
};
use crate::{repository::Timeline, CheckpointConfig};
#[derive(Debug, Clone, Copy)]
pub struct PointInTime {
@@ -42,7 +41,7 @@ pub fn create_repo(
tenant_id: ZTenantId,
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
remote_index: RemoteIndex,
) -> Result<Arc<LayeredRepository>> {
) -> Result<Arc<Repository>> {
let repo_dir = conf.tenant_path(&tenant_id);
ensure!(
!repo_dir.exists(),
@@ -57,9 +56,9 @@ pub fn create_repo(
info!("created directory structure in {}", repo_dir.display());
// Save tenant's config
LayeredRepository::persist_tenant_config(conf, tenant_id, tenant_conf)?;
Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?;
Ok(Arc::new(LayeredRepository::new(
Ok(Arc::new(Repository::new(
conf,
tenant_conf,
wal_redo_manager,
@@ -104,11 +103,11 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
// - run initdb to init temporary instance and get bootstrap data
// - after initialization complete, remove the temp dir.
//
fn bootstrap_timeline<R: Repository>(
fn bootstrap_timeline(
conf: &'static PageServerConf,
tenantid: ZTenantId,
tli: ZTimelineId,
repo: &R,
repo: &Repository,
) -> Result<()> {
let initdb_path = conf
.tenant_path(&tenantid)
@@ -160,7 +159,7 @@ pub(crate) fn create_timeline(
new_timeline_id: Option<ZTimelineId>,
ancestor_timeline_id: Option<ZTimelineId>,
mut ancestor_start_lsn: Option<Lsn>,
) -> Result<Option<(ZTimelineId, Arc<LayeredTimeline>)>> {
) -> Result<Option<(ZTimelineId, Arc<Timeline>)>> {
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;

View File

@@ -30,6 +30,7 @@ use anyhow::Result;
use bytes::{Buf, Bytes, BytesMut};
use tracing::*;
use crate::layered_repository::Timeline;
use crate::pgdatadir_mapping::*;
use crate::reltag::{RelTag, SlruKind};
use crate::walrecord::*;
@@ -43,15 +44,15 @@ use utils::lsn::Lsn;
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
pub struct WalIngest<'a, T: DatadirTimeline> {
timeline: &'a T,
pub struct WalIngest<'a> {
timeline: &'a Timeline,
checkpoint: CheckPoint,
checkpoint_modified: bool,
}
impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
pub fn new(timeline: &T, startpoint: Lsn) -> Result<WalIngest<T>> {
impl<'a> WalIngest<'a> {
pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result<WalIngest> {
// Fetch the latest checkpoint into memory, so that we can compare with it
// quickly in `ingest_record` and update it when it changes.
let checkpoint_bytes = timeline.get_checkpoint(startpoint)?;
@@ -77,7 +78,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
&mut self,
recdata: Bytes,
lsn: Lsn,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
modification.lsn = lsn;
@@ -266,7 +267,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_decoded_block(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
lsn: Lsn,
decoded: &DecodedWALRecord,
blk: &DecodedBkpBlock,
@@ -326,7 +327,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_heapam_record(
&mut self,
buf: &mut Bytes,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
decoded: &mut DecodedWALRecord,
) -> Result<()> {
// Handle VM bit updates that are implicitly part of heap records.
@@ -470,7 +471,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
/// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
fn ingest_xlog_dbase_create(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rec: &XlCreateDatabase,
) -> Result<()> {
let db_id = rec.db_id;
@@ -537,7 +538,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_xlog_smgr_create(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rec: &XlSmgrCreate,
) -> Result<()> {
let rel = RelTag {
@@ -555,7 +556,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
/// This is the same logic as in PostgreSQL's smgr_redo() function.
fn ingest_xlog_smgr_truncate(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rec: &XlSmgrTruncate,
) -> Result<()> {
let spcnode = rec.rnode.spcnode;
@@ -620,7 +621,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
///
fn ingest_xact_record(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
parsed: &XlXactParsedRecord,
is_commit: bool,
) -> Result<()> {
@@ -689,7 +690,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_clog_truncate_record(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
xlrec: &XlClogTruncate,
) -> Result<()> {
info!(
@@ -747,7 +748,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_multixact_create_record(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
xlrec: &XlMultiXactCreate,
) -> Result<()> {
// Create WAL record for updating the multixact-offsets page
@@ -826,7 +827,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_multixact_truncate_record(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
xlrec: &XlMultiXactTruncate,
) -> Result<()> {
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
@@ -860,7 +861,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn ingest_relmap_page(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
xlrec: &XlRelmapUpdate,
decoded: &DecodedWALRecord,
) -> Result<()> {
@@ -876,7 +877,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn put_rel_creation(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rel: RelTag,
) -> Result<()> {
modification.put_rel_creation(rel, 0)?;
@@ -885,7 +886,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn put_rel_page_image(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rel: RelTag,
blknum: BlockNumber,
img: Bytes,
@@ -897,7 +898,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn put_rel_wal_record(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rel: RelTag,
blknum: BlockNumber,
rec: ZenithWalRecord,
@@ -909,7 +910,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn put_rel_truncation(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rel: RelTag,
nblocks: BlockNumber,
) -> Result<()> {
@@ -917,11 +918,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
Ok(())
}
fn put_rel_drop(
&mut self,
modification: &mut DatadirModification<T>,
rel: RelTag,
) -> Result<()> {
fn put_rel_drop(&mut self, modification: &mut DatadirModification, rel: RelTag) -> Result<()> {
modification.put_rel_drop(rel)?;
Ok(())
}
@@ -937,7 +934,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn handle_rel_extend(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
rel: RelTag,
blknum: BlockNumber,
) -> Result<()> {
@@ -968,7 +965,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn put_slru_page_image(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
@@ -981,7 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
fn handle_slru_extend(
&mut self,
modification: &mut DatadirModification<T>,
modification: &mut DatadirModification,
kind: SlruKind,
segno: u32,
blknum: BlockNumber,
@@ -1032,9 +1029,9 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
#[cfg(test)]
mod tests {
use super::*;
use crate::layered_repository::repo_harness::*;
use crate::layered_repository::Timeline;
use crate::pgdatadir_mapping::create_test_timeline;
use crate::repository::repo_harness::*;
use crate::repository::Timeline;
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
use postgres_ffi::RELSEG_SIZE;
@@ -1046,13 +1043,13 @@ mod tests {
forknum: 0,
};
fn assert_current_logical_size<T: Timeline>(_timeline: &T, _lsn: Lsn) {
fn assert_current_logical_size(_timeline: &Timeline, _lsn: Lsn) {
// TODO
}
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
fn init_walingest_test<T: DatadirTimeline>(tline: &T) -> Result<WalIngest<T>> {
fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
let mut m = tline.begin_modification(Lsn(0x10));
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
@@ -1065,7 +1062,7 @@ mod tests {
#[test]
fn test_relsize() -> Result<()> {
let repo = RepoHarness::create("test_relsize")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
@@ -1193,7 +1190,7 @@ mod tests {
#[test]
fn test_drop_extend() -> Result<()> {
let repo = RepoHarness::create("test_drop_extend")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut m = tline.begin_modification(Lsn(0x20));
@@ -1233,7 +1230,7 @@ mod tests {
#[test]
fn test_truncate_extend() -> Result<()> {
let repo = RepoHarness::create("test_truncate_extend")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
// Create a 20 MB relation (the size is arbitrary)
@@ -1321,7 +1318,7 @@ mod tests {
#[test]
fn test_large_rel() -> Result<()> {
let repo = RepoHarness::create("test_large_rel")?.load();
let tline = create_test_timeline(repo, TIMELINE_ID)?;
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
let mut walingest = init_walingest_test(&*tline)?;
let mut lsn = 0x10;

View File

@@ -269,7 +269,7 @@ async fn wal_receiver_main_thread_loop_step<'a>(
}
}
// Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly.
LocalTimelineUpdate::Attach { id, datadir } => {
LocalTimelineUpdate::Attach { id, timeline } => {
let timeline_connection_managers = local_timeline_wal_receivers
.entry(id.tenant_id)
.or_default();
@@ -305,7 +305,7 @@ async fn wal_receiver_main_thread_loop_step<'a>(
id,
broker_prefix.to_owned(),
etcd_client.clone(),
datadir,
timeline,
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,

View File

@@ -16,7 +16,7 @@ use std::{
time::Duration,
};
use crate::{layered_repository::LayeredTimeline, repository::Timeline};
use crate::layered_repository::Timeline;
use anyhow::Context;
use chrono::{NaiveDateTime, Utc};
use etcd_broker::{
@@ -39,7 +39,7 @@ pub(super) fn spawn_connection_manager_task(
id: ZTenantTimelineId,
broker_loop_prefix: String,
mut client: Client,
local_timeline: Arc<LayeredTimeline>,
local_timeline: Arc<Timeline>,
wal_connect_timeout: Duration,
lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64,
@@ -242,7 +242,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
struct WalreceiverState {
id: ZTenantTimelineId,
/// Use pageserver data about the timeline to filter out some of the safekeepers.
local_timeline: Arc<LayeredTimeline>,
local_timeline: Arc<Timeline>,
/// The timeout on the connection to safekeeper for WAL streaming.
wal_connect_timeout: Duration,
/// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
@@ -300,7 +300,7 @@ struct EtcdSkTimeline {
impl WalreceiverState {
fn new(
id: ZTenantTimelineId,
local_timeline: Arc<LayeredTimeline>,
local_timeline: Arc<Timeline>,
wal_connect_timeout: Duration,
lagging_wal_timeout: Duration,
max_lsn_wal_lag: NonZeroU64,
@@ -735,12 +735,8 @@ fn wal_stream_connection_string(
#[cfg(test)]
mod tests {
use crate::repository::{
repo_harness::{RepoHarness, TIMELINE_ID},
Repository,
};
use super::*;
use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID};
#[test]
fn no_connection_no_candidate() -> anyhow::Result<()> {

View File

@@ -20,11 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument};
use super::TaskEvent;
use crate::{
layered_repository::WalReceiverInfo,
pgdatadir_mapping::DatadirTimeline,
repository::{Repository, Timeline},
tenant_mgr,
walingest::WalIngest,
layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest,
walrecord::DecodedWALRecord,
};
use postgres_ffi::v14::waldecoder::WalStreamDecoder;
@@ -67,7 +63,7 @@ pub async fn handle_walreceiver_connection(
)
.await
.context("Timed out while waiting for walreceiver connection to open")?
.context("Failed to open walreceiver conection")?;
.context("Failed to open walreceiver connection")?;
info!("connected!");
let mut connection_status = WalConnectionStatus {

View File

@@ -89,15 +89,52 @@ pub trait WalRedoManager: Send + Sync {
// for access to the postgres process ('wait') since there is only one for
// each tenant.
/// Time buckets are small because we want to be able to measure the
/// smallest redo processing times. These buckets allow us to measure down
/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
macro_rules! redo_histogram_time_buckets {
() => {
vec![
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
]
};
}
/// While we're at it, also measure the amount of records replayed in each
/// operation. We have a global 'total replayed' counter, but that's not
/// as useful as 'what is the skew for how many records we replay in one
/// operation'.
macro_rules! redo_histogram_count_buckets {
() => {
vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
};
}
static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
.expect("failed to define a metric")
register_histogram!(
"pageserver_wal_redo_seconds",
"Time spent on WAL redo",
redo_histogram_time_buckets!()
)
.expect("failed to define a metric")
});
static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_wait_seconds",
"Time spent waiting for access to the WAL redo process"
"Time spent waiting for access to the WAL redo process",
redo_histogram_time_buckets!(),
)
.expect("failed to define a metric")
});
static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"pageserver_wal_redo_records_histogram",
"Histogram of number of records replayed per redo",
redo_histogram_count_buckets!(),
)
.expect("failed to define a metric")
});
@@ -262,7 +299,10 @@ impl PostgresRedoManager {
let end_time = Instant::now();
let duration = end_time.duration_since(lock_time);
WAL_REDO_TIME.observe(duration.as_secs_f64());
WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
debug!(
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
records.len(),

26
pgxn/neon/Makefile Normal file
View File

@@ -0,0 +1,26 @@
# pgxs/neon/Makefile
MODULE_big = neon
OBJS = \
$(WIN32RES) \
inmem_smgr.o \
libpagestore.o \
libpqwalproposer.o \
pagestore_smgr.o \
relsize_cache.o \
neon.o \
walproposer.o \
walproposer_utils.o
PG_CPPFLAGS = -I$(libpq_srcdir)
SHLIB_LINK_INTERNAL = $(libpq)
EXTENSION = neon
DATA = neon--1.0.sql
PGFILEDESC = "neon - cloud storage for PostgreSQL"
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

286
pgxn/neon/inmem_smgr.c Normal file
View File

@@ -0,0 +1,286 @@
/*-------------------------------------------------------------------------
*
* inmem_smgr.c
*
* This is an implementation of the SMGR interface, used in the WAL redo
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
* storage, the pages that are written out are kept in a small number of
* in-memory buffers.
*
* Normally, replaying a WAL record only needs to access a handful of
* buffers, which fit in the normal buffer cache, so this is just for
* "overflow" storage when the buffer cache is not large enough.
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* contrib/neon/inmem_smgr.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/xlog.h"
#include "pagestore_client.h"
#include "storage/block.h"
#include "storage/buf_internals.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
/* Size of the in-memory smgr */
#define MAX_PAGES 64
/* If more than WARN_PAGES are used, print a warning in the log */
#define WARN_PAGES 32
static BufferTag page_tag[MAX_PAGES];
static char page_body[MAX_PAGES][BLCKSZ];
static int used_pages;
static int
locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
{
/* We only hold a small number of pages, so linear search */
for (int i = 0; i < used_pages; i++)
{
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
&& forknum == page_tag[i].forkNum
&& blkno == page_tag[i].blockNum)
{
return i;
}
}
return -1;
}
/*
* inmem_init() -- Initialize private state
*/
void
inmem_init(void)
{
used_pages = 0;
}
/*
* inmem_exists() -- Does the physical file exist?
*/
bool
inmem_exists(SMgrRelation reln, ForkNumber forknum)
{
for (int i = 0; i < used_pages; i++)
{
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
&& forknum == page_tag[i].forkNum)
{
return true;
}
}
return false;
}
/*
* inmem_create() -- Create a new relation on zenithd storage
*
* If isRedo is true, it's okay for the relation to exist already.
*/
void
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
}
/*
* inmem_unlink() -- Unlink a relation.
*/
void
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
{
}
/*
* inmem_extend() -- Add a block to the specified relation.
*
* The semantics are nearly the same as mdwrite(): write at the
* specified position. However, this is to be used for the case of
* extending a relation (i.e., blocknum is at or beyond the current
* EOF). Note that we assume writing a block beyond current EOF
* causes intervening file space to become filled with zeroes.
*/
void
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer, bool skipFsync)
{
/* same as smgwrite() for us */
inmem_write(reln, forknum, blkno, buffer, skipFsync);
}
/*
* inmem_open() -- Initialize newly-opened relation.
*/
void
inmem_open(SMgrRelation reln)
{
}
/*
* inmem_close() -- Close the specified relation, if it isn't closed already.
*/
void
inmem_close(SMgrRelation reln, ForkNumber forknum)
{
}
/*
* inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
*/
bool
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
return true;
}
/*
* inmem_writeback() -- Tell the kernel to write pages back to storage.
*/
void
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
}
/*
* inmem_read() -- Read the specified block from a relation.
*/
void
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
char *buffer)
{
int pg;
pg = locate_page(reln, forknum, blkno);
if (pg < 0)
memset(buffer, 0, BLCKSZ);
else
memcpy(buffer, page_body[pg], BLCKSZ);
}
/*
* inmem_write() -- Write the supplied block at the appropriate location.
*
* This is to be used only for updating already-existing blocks of a
* relation (ie, those before the current EOF). To extend a relation,
* use mdextend().
*/
void
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer, bool skipFsync)
{
int pg;
pg = locate_page(reln, forknum, blocknum);
if (pg < 0)
{
/*
* We assume the buffer cache is large enough to hold all the buffers
* needed for most operations. Overflowing to this "in-mem smgr" in rare
* cases is OK. But if we find that we're using more than WARN_PAGES,
* print a warning so that we get alerted and get to investigate why
* we're accessing so many buffers.
*/
elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
"inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
forknum,
blocknum,
used_pages);
if (used_pages == MAX_PAGES)
elog(ERROR, "Inmem storage overflow");
pg = used_pages;
used_pages++;
INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
} else {
elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
reln->smgr_rnode.node.spcNode,
reln->smgr_rnode.node.dbNode,
reln->smgr_rnode.node.relNode,
forknum,
blocknum,
used_pages);
}
memcpy(page_body[pg], buffer, BLCKSZ);
}
/*
* inmem_nblocks() -- Get the number of blocks stored in a relation.
*/
BlockNumber
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
{
/*
* It's not clear why a WAL redo function would call smgrnblocks().
* During recovery, at least before reaching consistency, the size of a
* relation could be arbitrarily small, if it was truncated after the
* record being replayed, or arbitrarily large if it was extended
* afterwards. But one place where it's called is in
* XLogReadBufferExtended(): it extends the relation, if it's smaller than
* the requested page. That's a waste of time in the WAL redo
* process. Pretend that all relations are maximally sized to avoid it.
*/
return MaxBlockNumber;
}
/*
* inmem_truncate() -- Truncate relation to specified number of blocks.
*/
void
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
}
/*
* inmem_immedsync() -- Immediately sync a relation to stable storage.
*/
void
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
{
}
static const struct f_smgr inmem_smgr =
{
.smgr_init = inmem_init,
.smgr_shutdown = NULL,
.smgr_open = inmem_open,
.smgr_close = inmem_close,
.smgr_create = inmem_create,
.smgr_exists = inmem_exists,
.smgr_unlink = inmem_unlink,
.smgr_extend = inmem_extend,
.smgr_prefetch = inmem_prefetch,
.smgr_read = inmem_read,
.smgr_write = inmem_write,
.smgr_writeback = inmem_writeback,
.smgr_nblocks = inmem_nblocks,
.smgr_truncate = inmem_truncate,
.smgr_immedsync = inmem_immedsync,
};
const f_smgr *
smgr_inmem(BackendId backend, RelFileNode rnode)
{
Assert(InRecovery);
if (backend != InvalidBackendId)
return smgr_standard(backend, rnode);
else
return &inmem_smgr;
}
void
smgr_init_inmem()
{
inmem_init();
}

432
pgxn/neon/libpagestore.c Normal file
View File

@@ -0,0 +1,432 @@
/*-------------------------------------------------------------------------
*
* libpagestore.c
* Handles network communications with the remote pagestore.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* contrib/neon/libpqpagestore.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "pagestore_client.h"
#include "fmgr.h"
#include "access/xlog.h"
#include "libpq-fe.h"
#include "libpq/pqformat.h"
#include "libpq/libpq.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "utils/guc.h"
#include "neon.h"
#include "walproposer.h"
#include "walproposer_utils.h"
#define PageStoreTrace DEBUG5
#define NEON_TAG "[NEON_SMGR] "
#define neon_log(tag, fmt, ...) ereport(tag, \
(errmsg(NEON_TAG fmt, ## __VA_ARGS__), \
errhidestmt(true), errhidecontext(true)))
bool connected = false;
PGconn *pageserver_conn = NULL;
char *page_server_connstring_raw;
static ZenithResponse *pageserver_call(ZenithRequest *request);
page_server_api api = {
.request = pageserver_call
};
static void
pageserver_connect()
{
char *query;
int ret;
Assert(!connected);
pageserver_conn = PQconnectdb(page_server_connstring);
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
PQfinish(pageserver_conn);
pageserver_conn = NULL;
ereport(ERROR,
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
errmsg(NEON_TAG "could not establish connection to pageserver"),
errdetail_internal("%s", msg)));
}
query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline);
ret = PQsendQuery(pageserver_conn, query);
if (ret != 1)
{
PQfinish(pageserver_conn);
pageserver_conn = NULL;
neon_log(ERROR, "could not send pagestream command to pageserver");
}
while (PQisBusy(pageserver_conn))
{
int wc;
/* Sleep until there's something to do */
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(pageserver_conn),
-1L, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (wc & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
PQfinish(pageserver_conn);
pageserver_conn = NULL;
neon_log(ERROR, "could not complete handshake with pageserver: %s",
msg);
}
}
}
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
connected = true;
}
/*
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
*/
static int
call_PQgetCopyData(PGconn *conn, char **buffer)
{
int ret;
retry:
ret = PQgetCopyData(conn, buffer, 1 /* async */ );
if (ret == 0)
{
int wc;
/* Sleep until there's something to do */
wc = WaitLatchOrSocket(MyLatch,
WL_LATCH_SET | WL_SOCKET_READABLE |
WL_EXIT_ON_PM_DEATH,
PQsocket(conn),
-1L, PG_WAIT_EXTENSION);
ResetLatch(MyLatch);
CHECK_FOR_INTERRUPTS();
/* Data available in socket? */
if (wc & WL_SOCKET_READABLE)
{
if (!PQconsumeInput(conn))
neon_log(ERROR, "could not get response from pageserver: %s",
PQerrorMessage(conn));
}
goto retry;
}
return ret;
}
static ZenithResponse *
pageserver_call(ZenithRequest *request)
{
StringInfoData req_buff;
StringInfoData resp_buff;
ZenithResponse *resp;
PG_TRY();
{
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
}
if (!connected)
pageserver_connect();
req_buff = zm_pack_request(request);
/*
* Send request.
*
* In principle, this could block if the output buffer is full, and we
* should use async mode and check for interrupts while waiting. In
* practice, our requests are small enough to always fit in the output
* and TCP buffer.
*/
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn))
{
neon_log(ERROR, "failed to send page request: %s",
PQerrorMessage(pageserver_conn));
}
pfree(req_buff.data);
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = zm_to_string((ZenithMessage *) request);
neon_log(PageStoreTrace, "sent request: %s", msg);
pfree(msg);
}
/* read response */
resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
resp_buff.cursor = 0;
if (resp_buff.len == -1)
neon_log(ERROR, "end of COPY");
else if (resp_buff.len == -2)
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
resp = zm_unpack_response(&resp_buff);
PQfreemem(resp_buff.data);
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = zm_to_string((ZenithMessage *) resp);
neon_log(PageStoreTrace, "got response: %s", msg);
pfree(msg);
}
}
PG_CATCH();
{
/*
* If anything goes wrong while we were sending a request, it's not
* clear what state the connection is in. For example, if we sent the
* request but didn't receive a response yet, we might receive the
* response some time later after we have already sent a new unrelated
* request. Close the connection to avoid getting confused.
*/
if (connected)
{
neon_log(LOG, "dropping connection to page server due to error");
PQfinish(pageserver_conn);
pageserver_conn = NULL;
connected = false;
}
PG_RE_THROW();
}
PG_END_TRY();
return (ZenithResponse *) resp;
}
static bool
check_zenith_id(char **newval, void **extra, GucSource source)
{
uint8 zid[16];
return **newval == '\0' || HexDecodeString(zid, *newval, 16);
}
static char *
substitute_pageserver_password(const char *page_server_connstring_raw)
{
char *host = NULL;
char *port = NULL;
char *user = NULL;
char *auth_token = NULL;
char *err = NULL;
char *page_server_connstring = NULL;
PQconninfoOption *conn_options;
PQconninfoOption *conn_option;
MemoryContext oldcontext;
/*
* Here we substitute password in connection string with an environment
* variable. To simplify things we construct a connection string back with
* only known options. In particular: host port user and password. We do
* not currently use other options and constructing full connstring in an
* URI shape is quite messy.
*/
if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0')
return NULL;
/* extract the auth token from the connection string */
conn_options = PQconninfoParse(page_server_connstring_raw, &err);
if (conn_options == NULL)
{
/* The error string is malloc'd, so we must free it explicitly */
char *errcopy = err ? pstrdup(err) : "out of memory";
PQfreemem(err);
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("invalid connection string syntax: %s", errcopy)));
}
/*
* Trying to populate pageserver connection string with auth token from
* environment. We are looking for password in with placeholder value like
* $ENV_VAR_NAME, so if password field is present and starts with $ we try
* to fetch environment variable value and fail loudly if it is not set.
*/
for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
{
if (strcmp(conn_option->keyword, "host") == 0)
{
if (conn_option->val != NULL && conn_option->val[0] != '\0')
host = conn_option->val;
}
else if (strcmp(conn_option->keyword, "port") == 0)
{
if (conn_option->val != NULL && conn_option->val[0] != '\0')
port = conn_option->val;
}
else if (strcmp(conn_option->keyword, "user") == 0)
{
if (conn_option->val != NULL && conn_option->val[0] != '\0')
user = conn_option->val;
}
else if (strcmp(conn_option->keyword, "password") == 0)
{
if (conn_option->val != NULL && conn_option->val[0] != '\0')
{
/* ensure that this is a template */
if (strncmp(conn_option->val, "$", 1) != 0)
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_EXCEPTION),
errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1])));
neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
auth_token = getenv(&conn_option->val[1]);
if (!auth_token)
{
ereport(ERROR,
(errcode(ERRCODE_CONNECTION_EXCEPTION),
errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
}
else
{
neon_log(LOG, "using auth token from environment passed via env");
}
}
}
}
/*
* allocate connection string in TopMemoryContext to make sure it is not
* freed
*/
oldcontext = CurrentMemoryContext;
MemoryContextSwitchTo(TopMemoryContext);
page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
MemoryContextSwitchTo(oldcontext);
PQconninfoFree(conn_options);
return page_server_connstring;
}
/*
* Module initialization function
*/
void
pg_init_libpagestore(void)
{
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
&page_server_connstring_raw,
"",
PGC_POSTMASTER,
0, /* no flags required */
NULL, NULL, NULL);
DefineCustomStringVariable("neon.timeline_id",
"Zenith timelineid the server is running on",
NULL,
&zenith_timeline,
"",
PGC_POSTMASTER,
0, /* no flags required */
check_zenith_id, NULL, NULL);
DefineCustomStringVariable("neon.tenant_id",
"Neon tenantid the server is running on",
NULL,
&zenith_tenant,
"",
PGC_POSTMASTER,
0, /* no flags required */
check_zenith_id, NULL, NULL);
DefineCustomBoolVariable("neon.wal_redo",
"start in wal-redo mode",
NULL,
&wal_redo,
false,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.max_cluster_size",
"cluster size limit",
NULL,
&max_cluster_size,
-1, -1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_MB,
NULL, NULL, NULL);
relsize_hash_init();
if (page_server != NULL)
neon_log(ERROR, "libpagestore already loaded");
neon_log(PageStoreTrace, "libpagestore already loaded");
page_server = &api;
/* substitute password in pageserver_connstring */
page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
/* Is there more correct way to pass CustomGUC to postgres code? */
zenith_timeline_walproposer = zenith_timeline;
zenith_tenant_walproposer = zenith_tenant;
if (wal_redo)
{
neon_log(PageStoreTrace, "set inmem_smgr hook");
smgr_hook = smgr_inmem;
smgr_init_hook = smgr_init_inmem;
}
else if (page_server_connstring && page_server_connstring[0])
{
neon_log(PageStoreTrace, "set neon_smgr hook");
smgr_hook = smgr_zenith;
smgr_init_hook = smgr_init_zenith;
dbsize_hook = zenith_dbsize;
}
}

View File

@@ -0,0 +1,413 @@
#include "postgres.h"
#include "libpq-fe.h"
#include "neon.h"
#include "walproposer.h"
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
struct WalProposerConn
{
PGconn* pg_conn;
bool is_nonblocking; /* whether the connection is non-blocking */
char *recvbuf; /* last received data from libpqprop_async_read */
};
/* Prototypes for exported functions */
static char* libpqprop_error_message(WalProposerConn* conn);
static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn);
static WalProposerConn* libpqprop_connect_start(char* conninfo);
static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn);
static bool libpqprop_send_query(WalProposerConn* conn, char* query);
static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn);
static pgsocket libpqprop_socket(WalProposerConn* conn);
static int libpqprop_flush(WalProposerConn* conn);
static void libpqprop_finish(WalProposerConn* conn);
static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount);
static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size);
static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size);
static WalProposerFunctionsType PQWalProposerFunctions = {
libpqprop_error_message,
libpqprop_status,
libpqprop_connect_start,
libpqprop_connect_poll,
libpqprop_send_query,
libpqprop_get_query_result,
libpqprop_socket,
libpqprop_flush,
libpqprop_finish,
libpqprop_async_read,
libpqprop_async_write,
libpqprop_blocking_write,
};
/* Module initialization */
void
pg_init_libpqwalproposer(void)
{
if (WalProposerFunctions != NULL)
elog(ERROR, "libpqwalproposer already loaded");
WalProposerFunctions = &PQWalProposerFunctions;
}
/* Helper function */
static bool
ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking)
{
/* If we're already correctly blocking or nonblocking, all good */
if (is_nonblocking == conn->is_nonblocking)
return true;
/* Otherwise, set it appropriately */
if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
return false;
conn->is_nonblocking = is_nonblocking;
return true;
}
/* Exported function definitions */
static char*
libpqprop_error_message(WalProposerConn* conn)
{
return PQerrorMessage(conn->pg_conn);
}
static WalProposerConnStatusType
libpqprop_status(WalProposerConn* conn)
{
switch (PQstatus(conn->pg_conn))
{
case CONNECTION_OK:
return WP_CONNECTION_OK;
case CONNECTION_BAD:
return WP_CONNECTION_BAD;
default:
return WP_CONNECTION_IN_PROGRESS;
}
}
static WalProposerConn*
libpqprop_connect_start(char* conninfo)
{
WalProposerConn* conn;
PGconn* pg_conn;
pg_conn = PQconnectStart(conninfo);
/*
* Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the
* behavior of PQconnectStart here.
*/
if (!pg_conn)
return NULL;
/*
* And in theory this allocation can fail as well, but it's incredibly unlikely if we just
* successfully allocated a PGconn.
*
* palloc will exit on failure though, so there's not much we could do if it *did* fail.
*/
conn = palloc(sizeof(WalProposerConn));
conn->pg_conn = pg_conn;
conn->is_nonblocking = false; /* connections always start in blocking mode */
conn->recvbuf = NULL;
return conn;
}
static WalProposerConnectPollStatusType
libpqprop_connect_poll(WalProposerConn* conn)
{
WalProposerConnectPollStatusType return_val;
switch (PQconnectPoll(conn->pg_conn))
{
case PGRES_POLLING_FAILED:
return_val = WP_CONN_POLLING_FAILED;
break;
case PGRES_POLLING_READING:
return_val = WP_CONN_POLLING_READING;
break;
case PGRES_POLLING_WRITING:
return_val = WP_CONN_POLLING_WRITING;
break;
case PGRES_POLLING_OK:
return_val = WP_CONN_POLLING_OK;
break;
/* There's a comment at its source about this constant being unused. We'll expect it's never
* returned. */
case PGRES_POLLING_ACTIVE:
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
/* This return is never actually reached, but it's here to make the compiler happy */
return WP_CONN_POLLING_FAILED;
default:
Assert(false);
return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */
}
return return_val;
}
static bool
libpqprop_send_query(WalProposerConn* conn, char* query)
{
/* We need to be in blocking mode for sending the query to run without
* requiring a call to PQflush */
if (!ensure_nonblocking_status(conn, false))
return false;
/* PQsendQuery returns 1 on success, 0 on failure */
if (!PQsendQuery(conn->pg_conn, query))
return false;
return true;
}
static WalProposerExecStatusType
libpqprop_get_query_result(WalProposerConn* conn)
{
PGresult* result;
WalProposerExecStatusType return_val;
/* Marker variable if we need to log an unexpected success result */
char* unexpected_success = NULL;
/* Consume any input that we might be missing */
if (!PQconsumeInput(conn->pg_conn))
return WP_EXEC_FAILED;
if (PQisBusy(conn->pg_conn))
return WP_EXEC_NEEDS_INPUT;
result = PQgetResult(conn->pg_conn);
/* PQgetResult returns NULL only if getting the result was successful & there's no more of the
* result to get. */
if (!result)
{
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
return WP_EXEC_UNEXPECTED_SUCCESS;
}
/* Helper macro to reduce boilerplate */
#define UNEXPECTED_SUCCESS(msg) \
return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
unexpected_success = msg; \
break;
switch (PQresultStatus(result))
{
/* "true" success case */
case PGRES_COPY_BOTH:
return_val = WP_EXEC_SUCCESS_COPYBOTH;
break;
/* Unexpected success case */
case PGRES_EMPTY_QUERY:
UNEXPECTED_SUCCESS("empty query return");
case PGRES_COMMAND_OK:
UNEXPECTED_SUCCESS("data-less command end");
case PGRES_TUPLES_OK:
UNEXPECTED_SUCCESS("tuples return");
case PGRES_COPY_OUT:
UNEXPECTED_SUCCESS("'Copy Out' response");
case PGRES_COPY_IN:
UNEXPECTED_SUCCESS("'Copy In' response");
case PGRES_SINGLE_TUPLE:
UNEXPECTED_SUCCESS("single tuple return");
case PGRES_PIPELINE_SYNC:
UNEXPECTED_SUCCESS("pipeline sync point");
/* Failure cases */
case PGRES_BAD_RESPONSE:
case PGRES_NONFATAL_ERROR:
case PGRES_FATAL_ERROR:
case PGRES_PIPELINE_ABORTED:
return_val = WP_EXEC_FAILED;
break;
default:
Assert(false);
return_val = WP_EXEC_FAILED; /* keep the compiler quiet */
}
if (unexpected_success)
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
return return_val;
}
static pgsocket
libpqprop_socket(WalProposerConn* conn)
{
return PQsocket(conn->pg_conn);
}
static int
libpqprop_flush(WalProposerConn* conn)
{
return (PQflush(conn->pg_conn));
}
static void
libpqprop_finish(WalProposerConn* conn)
{
if (conn->recvbuf != NULL)
PQfreemem(conn->recvbuf);
PQfinish(conn->pg_conn);
pfree(conn);
}
/*
* Receive a message from the safekeeper.
*
* On success, the data is placed in *buf. It is valid until the next call
* to this function.
*/
static PGAsyncReadResult
libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount)
{
int result;
if (conn->recvbuf != NULL)
{
PQfreemem(conn->recvbuf);
conn->recvbuf = NULL;
}
/* Call PQconsumeInput so that we have the data we need */
if (!PQconsumeInput(conn->pg_conn))
{
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
}
/* The docs for PQgetCopyData list the return values as:
* 0 if the copy is still in progress, but no "complete row" is
* available
* -1 if the copy is done
* -2 if an error occured
* (> 0) if it was successful; that value is the amount transferred.
*
* The protocol we use between walproposer and safekeeper means that we
* *usually* wouldn't expect to see that the copy is done, but this can
* sometimes be triggered by the server returning an ErrorResponse (which
* also happens to have the effect that the copy is done).
*/
switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
{
case 0:
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_TRY_AGAIN;
case -1:
{
/*
* If we get -1, it's probably because of a server error; the
* safekeeper won't normally send a CopyDone message.
*
* We can check PQgetResult to make sure that the server failed;
* it'll always result in PGRES_FATAL_ERROR
*/
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
if (status != PGRES_FATAL_ERROR)
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
/* If there was actually an error, it'll be properly reported by
* calls to PQerrorMessage -- we don't have to do anything else */
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
}
case -2:
*amount = 0;
*buf = NULL;
return PG_ASYNC_READ_FAIL;
default:
/* Positive values indicate the size of the returned result */
*amount = result;
*buf = conn->recvbuf;
return PG_ASYNC_READ_SUCCESS;
}
}
static PGAsyncWriteResult
libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size)
{
int result;
/* If we aren't in non-blocking mode, switch to it. */
if (!ensure_nonblocking_status(conn, true))
return PG_ASYNC_WRITE_FAIL;
/* The docs for PQputcopyData list the return values as:
* 1 if the data was queued,
* 0 if it was not queued because of full buffers, or
* -1 if an error occured
*/
result = PQputCopyData(conn->pg_conn, buf, size);
/* We won't get a result of zero because walproposer always empties the
* connection's buffers before sending more */
Assert(result != 0);
switch (result)
{
case 1:
/* good -- continue */
break;
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
elog(FATAL, "invalid return %d from PQputCopyData", result);
}
/* After queueing the data, we still need to flush to get it to send.
* This might take multiple tries, but we don't want to wait around
* until it's done.
*
* PQflush has the following returns (directly quoting the docs):
* 0 if sucessful,
* 1 if it was unable to send all the data in the send queue yet
* -1 if it failed for some reason
*/
switch (result = PQflush(conn->pg_conn)) {
case 0:
return PG_ASYNC_WRITE_SUCCESS;
case 1:
return PG_ASYNC_WRITE_TRY_FLUSH;
case -1:
return PG_ASYNC_WRITE_FAIL;
default:
elog(FATAL, "invalid return %d from PQflush", result);
}
}
static bool
libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size)
{
int result;
/* If we are in non-blocking mode, switch out of it. */
if (!ensure_nonblocking_status(conn, false))
return false;
/* Ths function is very similar to libpqprop_async_write. For more
* information, refer to the comments there */
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
return false;
Assert(result == 1);
/* Because the connection is non-blocking, flushing returns 0 or -1 */
if ((result = PQflush(conn->pg_conn)) == -1)
return false;
Assert(result == 0);
return true;
}

17
pgxn/neon/neon--1.0.sql Normal file
View File

@@ -0,0 +1,17 @@
\echo Use "CREATE EXTENSION neon" to load this file. \quit
CREATE FUNCTION pg_cluster_size()
RETURNS bigint
AS 'MODULE_PATHNAME', 'pg_cluster_size'
LANGUAGE C STRICT
PARALLEL UNSAFE;
CREATE FUNCTION backpressure_lsns(
OUT received_lsn pg_lsn,
OUT disk_consistent_lsn pg_lsn,
OUT remote_consistent_lsn pg_lsn
)
RETURNS record
AS 'MODULE_PATHNAME', 'backpressure_lsns'
LANGUAGE C STRICT
PARALLEL UNSAFE;

82
pgxn/neon/neon.c Normal file
View File

@@ -0,0 +1,82 @@
/*-------------------------------------------------------------------------
*
* neon.c
* Utility functions to expose neon specific information to user
*
* IDENTIFICATION
* contrib/neon/neon.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "catalog/pg_type.h"
#include "replication/walsender.h"
#include "funcapi.h"
#include "access/htup_details.h"
#include "utils/pg_lsn.h"
#include "utils/guc.h"
#include "neon.h"
#include "walproposer.h"
PG_MODULE_MAGIC;
void _PG_init(void);
void _PG_init(void)
{
pg_init_libpagestore();
pg_init_libpqwalproposer();
pg_init_walproposer();
EmitWarningsOnPlaceholders("neon");
}
PG_FUNCTION_INFO_V1(pg_cluster_size);
PG_FUNCTION_INFO_V1(backpressure_lsns);
Datum
pg_cluster_size(PG_FUNCTION_ARGS)
{
int64 size;
size = GetZenithCurrentClusterSize();
if (size == 0)
PG_RETURN_NULL();
PG_RETURN_INT64(size);
}
Datum
backpressure_lsns(PG_FUNCTION_ARGS)
{
XLogRecPtr writePtr;
XLogRecPtr flushPtr;
XLogRecPtr applyPtr;
Datum values[3];
bool nulls[3];
TupleDesc tupdesc;
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
tupdesc = CreateTemplateTupleDesc(3);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
MemSet(nulls, 0, sizeof(nulls));
values[0] = LSNGetDatum(writePtr);
values[1] = LSNGetDatum(flushPtr);
values[2] = LSNGetDatum(applyPtr);
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
}

4
pgxn/neon/neon.control Normal file
View File

@@ -0,0 +1,4 @@
# neon extension
comment = 'cloud storage for PostgreSQL'
default_version = '1.0'
module_pathname = '$libdir/neon'

19
pgxn/neon/neon.h Normal file
View File

@@ -0,0 +1,19 @@
/*-------------------------------------------------------------------------
*
* neon.h
* Functions used in the initialization of this extension.
*
* IDENTIFICATION
* contrib/neon/neon.h
*
*-------------------------------------------------------------------------
*/
#ifndef NEON_H
#define NEON_H
extern void pg_init_libpagestore(void);
extern void pg_init_libpqwalproposer(void);
extern void pg_init_walproposer(void);
#endif /* NEON_H */

View File

@@ -0,0 +1,221 @@
/*-------------------------------------------------------------------------
*
* pagestore_client.h
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* contrib/neon/pagestore_client.h
*
*-------------------------------------------------------------------------
*/
#ifndef pageserver_h
#define pageserver_h
#include "postgres.h"
#include "access/xlogdefs.h"
#include "storage/relfilenode.h"
#include "storage/block.h"
#include "storage/smgr.h"
#include "lib/stringinfo.h"
#include "libpq/pqformat.h"
#include "utils/memutils.h"
#include "pg_config.h"
typedef enum
{
/* pagestore_client -> pagestore */
T_ZenithExistsRequest = 0,
T_ZenithNblocksRequest,
T_ZenithGetPageRequest,
T_ZenithDbSizeRequest,
/* pagestore -> pagestore_client */
T_ZenithExistsResponse = 100,
T_ZenithNblocksResponse,
T_ZenithGetPageResponse,
T_ZenithErrorResponse,
T_ZenithDbSizeResponse,
} ZenithMessageTag;
/* base struct for c-style inheritance */
typedef struct
{
ZenithMessageTag tag;
} ZenithMessage;
#define messageTag(m) (((const ZenithMessage *)(m))->tag)
/*
* supertype of all the Zenith*Request structs below
*
* If 'latest' is true, we are requesting the latest page version, and 'lsn'
* is just a hint to the server that we know there are no versions of the page
* (or relation size, for exists/nblocks requests) later than the 'lsn'.
*/
typedef struct
{
ZenithMessageTag tag;
bool latest; /* if true, request latest page version */
XLogRecPtr lsn; /* request page version @ this LSN */
} ZenithRequest;
typedef struct
{
ZenithRequest req;
RelFileNode rnode;
ForkNumber forknum;
} ZenithExistsRequest;
typedef struct
{
ZenithRequest req;
RelFileNode rnode;
ForkNumber forknum;
} ZenithNblocksRequest;
typedef struct
{
ZenithRequest req;
Oid dbNode;
} ZenithDbSizeRequest;
typedef struct
{
ZenithRequest req;
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blkno;
} ZenithGetPageRequest;
/* supertype of all the Zenith*Response structs below */
typedef struct
{
ZenithMessageTag tag;
} ZenithResponse;
typedef struct
{
ZenithMessageTag tag;
bool exists;
} ZenithExistsResponse;
typedef struct
{
ZenithMessageTag tag;
uint32 n_blocks;
} ZenithNblocksResponse;
typedef struct
{
ZenithMessageTag tag;
char page[FLEXIBLE_ARRAY_MEMBER];
} ZenithGetPageResponse;
typedef struct
{
ZenithMessageTag tag;
int64 db_size;
} ZenithDbSizeResponse;
typedef struct
{
ZenithMessageTag tag;
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */
} ZenithErrorResponse;
extern StringInfoData zm_pack_request(ZenithRequest *msg);
extern ZenithResponse *zm_unpack_response(StringInfo s);
extern char *zm_to_string(ZenithMessage *msg);
/*
* API
*/
typedef struct
{
ZenithResponse *(*request) (ZenithRequest *request);
} page_server_api;
extern page_server_api *page_server;
extern char *page_server_connstring;
extern char *zenith_timeline;
extern char *zenith_tenant;
extern bool wal_redo;
extern int32 max_cluster_size;
extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode);
extern void smgr_init_zenith(void);
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
extern void smgr_init_inmem(void);
extern void smgr_shutdown_inmem(void);
/* zenith storage manager functionality */
extern void zenith_init(void);
extern void zenith_open(SMgrRelation reln);
extern void zenith_close(SMgrRelation reln, ForkNumber forknum);
extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum);
extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void zenith_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer);
extern void zenith_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum);
extern const int64 zenith_dbsize(Oid dbNode);
extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum);
/* zenith wal-redo storage manager functionality */
extern void inmem_init(void);
extern void inmem_open(SMgrRelation reln);
extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum);
extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
char *buffer);
extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, char *buffer, bool skipFsync);
extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
/* utils for zenith relsize cache */
extern void relsize_hash_init(void);
extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size);
extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
#endif

1696
pgxn/neon/pagestore_smgr.c Normal file

File diff suppressed because it is too large Load Diff

167
pgxn/neon/relsize_cache.c Normal file
View File

@@ -0,0 +1,167 @@
/*-------------------------------------------------------------------------
*
* relsize_cache.c
* Relation size cache for better zentih performance.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* contrib/neon/relsize_cache.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "pagestore_client.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
#include "storage/lwlock.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "catalog/pg_tablespace_d.h"
#include "utils/dynahash.h"
#include "utils/guc.h"
typedef struct
{
RelFileNode rnode;
ForkNumber forknum;
} RelTag;
typedef struct
{
RelTag tag;
BlockNumber size;
} RelSizeEntry;
static HTAB *relsize_hash;
static LWLockId relsize_lock;
static int relsize_hash_size;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
/*
* Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
* which seems reasonable.
*/
#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
static void
zenith_smgr_shmem_startup(void)
{
static HASHCTL info;
if (prev_shmem_startup_hook)
prev_shmem_startup_hook();
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
info.keysize = sizeof(RelTag);
info.entrysize = sizeof(RelSizeEntry);
relsize_hash = ShmemInitHash("neon_relsize",
relsize_hash_size, relsize_hash_size,
&info,
HASH_ELEM | HASH_BLOBS);
LWLockRelease(AddinShmemInitLock);
}
bool
get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
{
bool found = false;
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
tag.rnode = rnode;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_SHARED);
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
if (entry != NULL)
{
*size = entry->size;
found = true;
}
LWLockRelease(relsize_lock);
}
return found;
}
void
set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
{
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
tag.rnode = rnode;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
entry->size = size;
LWLockRelease(relsize_lock);
}
}
void
update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
{
if (relsize_hash_size > 0)
{
RelTag tag;
RelSizeEntry *entry;
bool found;
tag.rnode = rnode;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
if (!found || entry->size < size)
entry->size = size;
LWLockRelease(relsize_lock);
}
}
void
forget_cached_relsize(RelFileNode rnode, ForkNumber forknum)
{
if (relsize_hash_size > 0)
{
RelTag tag;
tag.rnode = rnode;
tag.forknum = forknum;
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
LWLockRelease(relsize_lock);
}
}
void
relsize_hash_init(void)
{
DefineCustomIntVariable("neon.relsize_hash_size",
"Sets the maximum number of cached relation sizes for neon",
NULL,
&relsize_hash_size,
DEFAULT_RELSIZE_HASH_SIZE,
0,
INT_MAX,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
if (relsize_hash_size > 0)
{
RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
RequestNamedLWLockTranche("neon_relsize", 1);
prev_shmem_startup_hook = shmem_startup_hook;
shmem_startup_hook = zenith_smgr_shmem_startup;
}
}

2403
pgxn/neon/walproposer.c Normal file

File diff suppressed because it is too large Load Diff

540
pgxn/neon/walproposer.h Normal file
View File

@@ -0,0 +1,540 @@
#ifndef __NEON_WALPROPOSER_H__
#define __NEON_WALPROPOSER_H__
#include "access/xlogdefs.h"
#include "postgres.h"
#include "port.h"
#include "access/xlog_internal.h"
#include "access/transam.h"
#include "nodes/replnodes.h"
#include "utils/uuid.h"
#include "replication/walreceiver.h"
#define SK_MAGIC 0xCafeCeefu
#define SK_PROTOCOL_VERSION 2
#define MAX_SAFEKEEPERS 32
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */
#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */
#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */
/*
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured,
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
*/
#define WL_NO_EVENTS 0
extern char* wal_acceptors_list;
extern int wal_acceptor_reconnect_timeout;
extern int wal_acceptor_connect_timeout;
extern bool am_wal_proposer;
struct WalProposerConn; /* Defined in libpqwalproposer */
typedef struct WalProposerConn WalProposerConn;
struct WalMessage;
typedef struct WalMessage WalMessage;
extern char *zenith_timeline_walproposer;
extern char *zenith_tenant_walproposer;
/* Possible return values from ReadPGAsync */
typedef enum
{
/* The full read was successful. buf now points to the data */
PG_ASYNC_READ_SUCCESS,
/* The read is ongoing. Wait until the connection is read-ready, then try
* again. */
PG_ASYNC_READ_TRY_AGAIN,
/* Reading failed. Check PQerrorMessage(conn) */
PG_ASYNC_READ_FAIL,
} PGAsyncReadResult;
/* Possible return values from WritePGAsync */
typedef enum
{
/* The write fully completed */
PG_ASYNC_WRITE_SUCCESS,
/* The write started, but you'll need to call PQflush some more times
* to finish it off. We just tried, so it's best to wait until the
* connection is read- or write-ready to try again.
*
* If it becomes read-ready, call PQconsumeInput and flush again. If it
* becomes write-ready, just call PQflush.
*/
PG_ASYNC_WRITE_TRY_FLUSH,
/* Writing failed. Check PQerrorMessage(conn) */
PG_ASYNC_WRITE_FAIL,
} PGAsyncWriteResult;
/*
* WAL safekeeper state, which is used to wait for some event.
*
* States are listed here in the order that they're executed.
*
* Most states, upon failure, will move back to SS_OFFLINE by calls to
* ResetConnection or ShutdownConnection.
*/
typedef enum
{
/*
* Does not have an active connection and will stay that way until
* further notice.
*
* Moves to SS_CONNECTING_WRITE by calls to ResetConnection.
*/
SS_OFFLINE,
/*
* Connecting states. "_READ" waits for the socket to be available for
* reading, "_WRITE" waits for writing. There's no difference in the code
* they execute when polled, but we have this distinction in order to
* recreate the event set in HackyRemoveWalProposerEvent.
*
* After the connection is made, "START_WAL_PUSH" query is sent.
*/
SS_CONNECTING_WRITE,
SS_CONNECTING_READ,
/*
* Waiting for the result of the "START_WAL_PUSH" command.
*
* After we get a successful result, sends handshake to safekeeper.
*/
SS_WAIT_EXEC_RESULT,
/*
* Executing the receiving half of the handshake. After receiving, moves to
* SS_VOTING.
*/
SS_HANDSHAKE_RECV,
/*
* Waiting to participate in voting, but a quorum hasn't yet been reached.
* This is an idle state - we do not expect AdvancePollState to be called.
*
* Moved externally by execution of SS_HANDSHAKE_RECV, when we received a
* quorum of handshakes.
*/
SS_VOTING,
/*
* Already sent voting information, waiting to receive confirmation from the
* node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet.
*/
SS_WAIT_VERDICT,
/* Need to flush ProposerElected message. */
SS_SEND_ELECTED_FLUSH,
/*
* Waiting for quorum to send WAL. Idle state. If the socket becomes
* read-ready, the connection has been closed.
*
* Moves to SS_ACTIVE only by call to StartStreaming.
*/
SS_IDLE,
/*
* Active phase, when we acquired quorum and have WAL to send or feedback
* to read.
*/
SS_ACTIVE,
} SafekeeperState;
/* Consensus logical timestamp. */
typedef uint64 term_t;
/* neon storage node id */
typedef uint64 NNodeId;
/*
* Proposer <-> Acceptor messaging.
*/
/* Initial Proposer -> Acceptor message */
typedef struct ProposerGreeting
{
uint64 tag; /* message tag */
uint32 protocolVersion; /* proposer-safekeeper protocol version */
uint32 pgVersion;
pg_uuid_t proposerId;
uint64 systemId; /* Postgres system identifier */
uint8 ztimelineid[16]; /* Zenith timeline id */
uint8 ztenantid[16];
TimeLineID timeline;
uint32 walSegSize;
} ProposerGreeting;
typedef struct AcceptorProposerMessage
{
uint64 tag;
} AcceptorProposerMessage;
/*
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
*/
typedef struct AcceptorGreeting
{
AcceptorProposerMessage apm;
term_t term;
NNodeId nodeId;
} AcceptorGreeting;
/*
* Proposer -> Acceptor vote request.
*/
typedef struct VoteRequest
{
uint64 tag;
term_t term;
pg_uuid_t proposerId; /* for monitoring/debugging */
} VoteRequest;
/* Element of term switching chain. */
typedef struct TermSwitchEntry
{
term_t term;
XLogRecPtr lsn;
} TermSwitchEntry;
typedef struct TermHistory
{
uint32 n_entries;
TermSwitchEntry *entries;
} TermHistory;
/* Vote itself, sent from safekeeper to proposer */
typedef struct VoteResponse {
AcceptorProposerMessage apm;
term_t term;
uint64 voteGiven;
/*
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
* proposer to choose the most advanced one.
*/
XLogRecPtr flushLsn;
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */
TermHistory termHistory;
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
} VoteResponse;
/*
* Proposer -> Acceptor message announcing proposer is elected and communicating
* epoch history to it.
*/
typedef struct ProposerElected
{
uint64 tag;
term_t term;
/* proposer will send since this point */
XLogRecPtr startStreamingAt;
/* history of term switches up to this proposer */
TermHistory *termHistory;
/* timeline globally starts at this LSN */
XLogRecPtr timelineStartLsn;
} ProposerElected;
/*
* Header of request with WAL message sent from proposer to safekeeper.
*/
typedef struct AppendRequestHeader
{
uint64 tag;
term_t term; /* term of the proposer */
/*
* LSN since which current proposer appends WAL (begin_lsn of its first
* record); determines epoch switch point.
*/
XLogRecPtr epochStartLsn;
XLogRecPtr beginLsn; /* start position of message in WAL */
XLogRecPtr endLsn; /* end position of message in WAL */
XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */
/*
* minimal LSN which may be needed for recovery of some safekeeper (end lsn
* + 1 of last chunk streamed to everyone)
*/
XLogRecPtr truncateLsn;
pg_uuid_t proposerId; /* for monitoring/debugging */
} AppendRequestHeader;
/*
* Hot standby feedback received from replica
*/
typedef struct HotStandbyFeedback
{
TimestampTz ts;
FullTransactionId xmin;
FullTransactionId catalog_xmin;
} HotStandbyFeedback;
typedef struct ReplicationFeedback
{
// current size of the timeline on pageserver
uint64 currentClusterSize;
// standby_status_update fields that safekeeper received from pageserver
XLogRecPtr ps_writelsn;
XLogRecPtr ps_flushlsn;
XLogRecPtr ps_applylsn;
TimestampTz ps_replytime;
} ReplicationFeedback;
typedef struct WalproposerShmemState
{
slock_t mutex;
ReplicationFeedback feedback;
term_t mineLastElectedTerm;
} WalproposerShmemState;
/*
* Report safekeeper state to proposer
*/
typedef struct AppendResponse
{
AcceptorProposerMessage apm;
/*
* Current term of the safekeeper; if it is higher than proposer's, the
* compute is out of date.
*/
term_t term;
// TODO: add comment
XLogRecPtr flushLsn;
// Safekeeper reports back his awareness about which WAL is committed, as
// this is a criterion for walproposer --sync mode exit
XLogRecPtr commitLsn;
HotStandbyFeedback hs;
// Feedback recieved from pageserver includes standby_status_update fields
// and custom zenith feedback.
// This part of the message is extensible.
ReplicationFeedback rf;
} AppendResponse;
// ReplicationFeedback is extensible part of the message that is parsed separately
// Other fields are fixed part
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
/*
* Descriptor of safekeeper
*/
typedef struct Safekeeper
{
char const* host;
char const* port;
char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */
/*
* postgres protocol connection to the WAL acceptor
*
* Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
* reach SS_ACTIVE; not before.
*/
WalProposerConn* conn;
/*
* Temporary buffer for the message being sent to the safekeeper.
*/
StringInfoData outbuf;
/*
* WAL reader, allocated for each safekeeper.
*/
XLogReaderState* xlogreader;
/*
* Streaming will start here; must be record boundary.
*/
XLogRecPtr startStreamingAt;
bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */
XLogRecPtr streamingAt; /* current streaming position */
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
int eventPos; /* position in wait event set. Equal to -1 if no event */
SafekeeperState state; /* safekeeper state machine state */
TimestampTz startedConnAt; /* when connection attempt started */
AcceptorGreeting greetResponse; /* acceptor greeting */
VoteResponse voteResponse; /* the vote */
AppendResponse appendResponse; /* feedback for master */
} Safekeeper;
extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
void WalProposerPoll(void);
void WalProposerRegister(void);
void ParseReplicationFeedbackMessage(StringInfo reply_message,
ReplicationFeedback *rf);
extern void StartProposerReplication(StartReplicationCmd *cmd);
Size WalproposerShmemSize(void);
bool WalproposerShmemInit(void);
void replication_feedback_set(ReplicationFeedback *rf);
void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
/* libpqwalproposer hooks & helper type */
/* Re-exported PostgresPollingStatusType */
typedef enum
{
WP_CONN_POLLING_FAILED = 0,
WP_CONN_POLLING_READING,
WP_CONN_POLLING_WRITING,
WP_CONN_POLLING_OK,
/*
* 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
* We've removed it here to avoid clutter.
*/
} WalProposerConnectPollStatusType;
/* Re-exported and modified ExecStatusType */
typedef enum
{
/* We received a single CopyBoth result */
WP_EXEC_SUCCESS_COPYBOTH,
/* Any success result other than a single CopyBoth was received. The specifics of the result
* were already logged, but it may be useful to provide an error message indicating which
* safekeeper messed up.
*
* Do not expect PQerrorMessage to be appropriately set. */
WP_EXEC_UNEXPECTED_SUCCESS,
/* No result available at this time. Wait until read-ready, then call again. Internally, this is
* returned when PQisBusy indicates that PQgetResult would block. */
WP_EXEC_NEEDS_INPUT,
/* Catch-all failure. Check PQerrorMessage. */
WP_EXEC_FAILED,
} WalProposerExecStatusType;
/* Re-exported ConnStatusType */
typedef enum
{
WP_CONNECTION_OK,
WP_CONNECTION_BAD,
/*
* The original ConnStatusType has many more tags, but requests that
* they not be relied upon (except for displaying to the user). We
* don't need that extra functionality, so we collect them into a
* single tag here.
*/
WP_CONNECTION_IN_PROGRESS,
} WalProposerConnStatusType;
/* Re-exported PQerrorMessage */
typedef char* (*walprop_error_message_fn) (WalProposerConn* conn);
/* Re-exported PQstatus */
typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn);
/* Re-exported PQconnectStart */
typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo);
/* Re-exported PQconectPoll */
typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn);
/* Blocking wrapper around PQsendQuery */
typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query);
/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn);
/* Re-exported PQsocket */
typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn);
/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
typedef int (*walprop_flush_fn) (WalProposerConn* conn);
/* Re-exported PQfinish */
typedef void (*walprop_finish_fn) (WalProposerConn* conn);
/*
* Ergonomic wrapper around PGgetCopyData
*
* Reads a CopyData block from a safekeeper, setting *amount to the number
* of bytes returned.
*
* This function is allowed to assume certain properties specific to the
* protocol with the safekeepers, so it should not be used as-is for any
* other purpose.
*
* Note: If possible, using <AsyncRead> is generally preferred, because it
* performs a bit of extra checking work that's always required and is normally
* somewhat verbose.
*/
typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn,
char** buf,
int* amount);
/*
* Ergonomic wrapper around PQputCopyData + PQflush
*
* Starts to write a CopyData block to a safekeeper.
*
* For information on the meaning of return codes, refer to PGAsyncWriteResult.
*/
typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn,
void const* buf,
size_t size);
/*
* Blocking equivalent to walprop_async_write_fn
*
* Returns 'true' if successful, 'false' on failure.
*/
typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size);
/* All libpqwalproposer exported functions collected together. */
typedef struct WalProposerFunctionsType
{
walprop_error_message_fn walprop_error_message;
walprop_status_fn walprop_status;
walprop_connect_start_fn walprop_connect_start;
walprop_connect_poll_fn walprop_connect_poll;
walprop_send_query_fn walprop_send_query;
walprop_get_query_result_fn walprop_get_query_result;
walprop_socket_fn walprop_socket;
walprop_flush_fn walprop_flush;
walprop_finish_fn walprop_finish;
walprop_async_read_fn walprop_async_read;
walprop_async_write_fn walprop_async_write;
walprop_blocking_write_fn walprop_blocking_write;
} WalProposerFunctionsType;
/* Allow the above functions to be "called" with normal syntax */
#define walprop_error_message(conn) \
WalProposerFunctions->walprop_error_message(conn)
#define walprop_status(conn) \
WalProposerFunctions->walprop_status(conn)
#define walprop_connect_start(conninfo) \
WalProposerFunctions->walprop_connect_start(conninfo)
#define walprop_connect_poll(conn) \
WalProposerFunctions->walprop_connect_poll(conn)
#define walprop_send_query(conn, query) \
WalProposerFunctions->walprop_send_query(conn, query)
#define walprop_get_query_result(conn) \
WalProposerFunctions->walprop_get_query_result(conn)
#define walprop_set_nonblocking(conn, arg) \
WalProposerFunctions->walprop_set_nonblocking(conn, arg)
#define walprop_socket(conn) \
WalProposerFunctions->walprop_socket(conn)
#define walprop_flush(conn) \
WalProposerFunctions->walprop_flush(conn)
#define walprop_finish(conn) \
WalProposerFunctions->walprop_finish(conn)
#define walprop_async_read(conn, buf, amount) \
WalProposerFunctions->walprop_async_read(conn, buf, amount)
#define walprop_async_write(conn, buf, size) \
WalProposerFunctions->walprop_async_write(conn, buf, size)
#define walprop_blocking_write(conn, buf, size) \
WalProposerFunctions->walprop_blocking_write(conn, buf, size)
/*
* The runtime location of the libpqwalproposer functions.
*
* This pointer is set by the initializer in libpqwalproposer, so that we
* can use it later.
*/
extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions;
#endif /* __NEON_WALPROPOSER_H__ */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
#ifndef __NEON_WALPROPOSER_UTILS_H__
#define __NEON_WALPROPOSER_UTILS_H__
#include "walproposer.h"
int CompareLsn(const void *a, const void *b);
char* FormatSafekeeperState(SafekeeperState state);
void AssertEventsOkForState(uint32 events, Safekeeper* sk);
uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
char* FormatEvents(uint32 events);
bool HexDecodeString(uint8 *result, char *input, int nbytes);
uint32 pq_getmsgint32_le(StringInfo msg);
uint64 pq_getmsgint64_le(StringInfo msg);
void pq_sendint32_le(StringInfo buf, uint32 i);
void pq_sendint64_le(StringInfo buf, uint64 i);
void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
void XLogWalPropClose(XLogRecPtr recptr);
#endif /* __NEON_WALPROPOSER_UTILS_H__ */

View File

@@ -0,0 +1,15 @@
# pgxs/neon_test_utils/Makefile
MODULE_big = neon_test_utils
OBJS = \
$(WIN32RES) \
neontest.o
EXTENSION = neon_test_utils
DATA = neon_test_utils--1.0.sql
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)

View File

@@ -0,0 +1,29 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit
CREATE FUNCTION test_consume_xids(nxids int)
RETURNS VOID
AS 'MODULE_PATHNAME', 'test_consume_xids'
LANGUAGE C STRICT
PARALLEL UNSAFE;
CREATE FUNCTION clear_buffer_cache()
RETURNS VOID
AS 'MODULE_PATHNAME', 'clear_buffer_cache'
LANGUAGE C STRICT
PARALLEL UNSAFE;
CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn)
RETURNS bytea
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn)
RETURNS bytea
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
LANGUAGE C PARALLEL UNSAFE;
CREATE FUNCTION neon_xlogflush(lsn pg_lsn)
RETURNS VOID
AS 'MODULE_PATHNAME', 'neon_xlogflush'
LANGUAGE C PARALLEL UNSAFE;

View File

@@ -0,0 +1,5 @@
# neon_test_utils extension
comment = 'helpers for neon testing and debugging'
default_version = '1.0'
module_pathname = '$libdir/neon_test_utils'
relocatable = true

View File

@@ -0,0 +1,304 @@
/*-------------------------------------------------------------------------
*
* neontest.c
* Helpers for neon testing and debugging
*
* IDENTIFICATION
* contrib/neon_test_utils/neontest.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "access/relation.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/namespace.h"
#include "fmgr.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "utils/builtins.h"
#include "utils/pg_lsn.h"
#include "utils/rel.h"
#include "utils/varlena.h"
#include "../neon/pagestore_client.h"
PG_MODULE_MAGIC;
extern void _PG_init(void);
PG_FUNCTION_INFO_V1(test_consume_xids);
PG_FUNCTION_INFO_V1(clear_buffer_cache);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
PG_FUNCTION_INFO_V1(neon_xlogflush);
/*
* Linkage to functions in zenith module.
* The signature here would need to be updated whenever function parameters change in pagestore_smgr.c
*/
typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
XLogRecPtr request_lsn, bool request_latest, char *buffer);
static zenith_read_at_lsn_type zenith_read_at_lsn_ptr;
/*
* Module initialize function: fetch function pointers for cross-module calls.
*/
void
_PG_init(void)
{
/* Asserts verify that typedefs above match original declarations */
AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type);
zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type)
load_external_function("$libdir/neon", "zenith_read_at_lsn",
true, NULL);
}
#define zenith_read_at_lsn zenith_read_at_lsn_ptr
/*
* test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound.
*/
Datum
test_consume_xids(PG_FUNCTION_ARGS)
{
int32 nxids = PG_GETARG_INT32(0);
TransactionId topxid;
FullTransactionId fullxid;
TransactionId xid;
TransactionId targetxid;
/* make sure we have a top-XID first */
topxid = GetTopTransactionId();
xid = ReadNextTransactionId();
targetxid = xid + nxids;
while (targetxid < FirstNormalTransactionId)
targetxid++;
while (TransactionIdPrecedes(xid, targetxid))
{
fullxid = GetNewTransactionId(true);
xid = XidFromFullTransactionId(fullxid);
elog(DEBUG1, "topxid: %u xid: %u", topxid, xid);
}
PG_RETURN_VOID();
}
/*
* Flush the buffer cache, evicting all pages that are not currently pinned.
*/
Datum
clear_buffer_cache(PG_FUNCTION_ARGS)
{
bool save_zenith_test_evict;
/*
* Temporarily set the zenith_test_evict GUC, so that when we pin and
* unpin a buffer, the buffer is evicted. We use that hack to evict all
* buffers, as there is no explicit "evict this buffer" function in the
* buffer manager.
*/
save_zenith_test_evict = zenith_test_evict;
zenith_test_evict = true;
PG_TRY();
{
/* Scan through all the buffers */
for (int i = 0; i < NBuffers; i++)
{
BufferDesc *bufHdr;
uint32 buf_state;
Buffer bufferid;
bool isvalid;
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blocknum;
/* Peek into the buffer header to see what page it holds. */
bufHdr = GetBufferDescriptor(i);
buf_state = LockBufHdr(bufHdr);
if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
isvalid = true;
else
isvalid = false;
bufferid = BufferDescriptorGetBuffer(bufHdr);
rnode = bufHdr->tag.rnode;
forknum = bufHdr->tag.forkNum;
blocknum = bufHdr->tag.blockNum;
UnlockBufHdr(bufHdr, buf_state);
/*
* Pin the buffer, and release it again. Because we have
* zenith_test_evict==true, this will evict the page from
* the buffer cache if no one else is holding a pin on it.
*/
if (isvalid)
{
if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid))
ReleaseBuffer(bufferid);
}
}
}
PG_FINALLY();
{
/* restore the GUC */
zenith_test_evict = save_zenith_test_evict;
}
PG_END_TRY();
PG_RETURN_VOID();
}
/*
* Reads the page from page server without buffer cache
* usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN
* NULL read lsn will result in reading the latest version.
*
* Note: reading latest version will result in waiting for latest changes to reach the page server,
* if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page
*/
Datum
get_raw_page_at_lsn(PG_FUNCTION_ARGS)
{
bytea *raw_page;
ForkNumber forknum;
RangeVar *relrv;
Relation rel;
char *raw_page_data;
text *relname;
text *forkname;
uint32 blkno;
bool request_latest = PG_ARGISNULL(3);
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3);
if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
PG_RETURN_NULL();
relname = PG_GETARG_TEXT_PP(0);
forkname = PG_GETARG_TEXT_PP(1);
blkno = PG_GETARG_UINT32(2);
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use raw page functions")));
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
rel = relation_openrv(relrv, AccessShareLock);
/* Check that this relation has storage */
if (rel->rd_rel->relkind == RELKIND_VIEW)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot get raw page from view \"%s\"",
RelationGetRelationName(rel))));
if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot get raw page from composite type \"%s\"",
RelationGetRelationName(rel))));
if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot get raw page from foreign table \"%s\"",
RelationGetRelationName(rel))));
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot get raw page from partitioned table \"%s\"",
RelationGetRelationName(rel))));
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
ereport(ERROR,
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
errmsg("cannot get raw page from partitioned index \"%s\"",
RelationGetRelationName(rel))));
/*
* Reject attempts to read non-local temporary relations; we would be
* likely to get wrong data since we have no visibility into the owning
* session's local buffers.
*/
if (RELATION_IS_OTHER_TEMP(rel))
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot access temporary tables of other sessions")));
forknum = forkname_to_number(text_to_cstring(forkname));
/* Initialize buffer to copy to */
raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
raw_page_data = VARDATA(raw_page);
zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
relation_close(rel, AccessShareLock);
PG_RETURN_BYTEA_P(raw_page);
}
/*
* Another option to read a relation page from page server without cache
* this version doesn't validate input and allows reading blocks of dropped relations
*
* Note: reading latest version will result in waiting for latest changes to reach the page server,
* if this is undesirable, use pageinspect' get_raw_page that uses buffered access to the latest page
*/
Datum
get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
{
char *raw_page_data;
if (!superuser())
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to use raw page functions")));
if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) ||
PG_ARGISNULL(3) || PG_ARGISNULL(4))
PG_RETURN_NULL();
{
RelFileNode rnode = {
.spcNode = PG_GETARG_OID(0),
.dbNode = PG_GETARG_OID(1),
.relNode = PG_GETARG_OID(2)
};
ForkNumber forknum = PG_GETARG_UINT32(3);
uint32 blkno = PG_GETARG_UINT32(4);
bool request_latest = PG_ARGISNULL(5);
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5);
/* Initialize buffer to copy to */
bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
raw_page_data = VARDATA(raw_page);
zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data);
PG_RETURN_BYTEA_P(raw_page);
}
}
/*
* Directly calls XLogFlush(lsn) to flush WAL buffers.
*/
Datum
neon_xlogflush(PG_FUNCTION_ARGS)
{
XLogRecPtr lsn = PG_GETARG_LSN(0);
XLogFlush(lsn);
PG_RETURN_VOID();
}

294
poetry.lock generated

File diff suppressed because one or more lines are too long

View File

@@ -1,11 +1,10 @@
#!/usr/bin/env python3
from typing import List
import argparse
import enum
import subprocess
import sys
import enum
import argparse
import os
from typing import List
@enum.unique
@@ -37,15 +36,24 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
return cmd
def yapf(fix_inplace: bool) -> str:
cmd = "poetry run yapf --recursive"
if fix_inplace:
cmd += " --in-place"
else:
cmd += " --diff"
def black(fix_inplace: bool) -> str:
cmd = "poetry run black"
if not fix_inplace:
cmd += " --diff --check"
return cmd
def isort(fix_inplace: bool) -> str:
cmd = "poetry run isort"
if not fix_inplace:
cmd += " --diff --check"
return cmd
def flake8() -> str:
return "poetry run flake8"
def mypy() -> str:
return "poetry run mypy"
@@ -71,11 +79,13 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
else:
print("Please inspect the output below and run make fmt to fix automatically.")
if suffix == ".py":
print("If the output is empty, ensure that you've installed Python tooling by\n"
"running './scripts/pysync' in the current directory (no root needed)")
print(
"If the output is empty, ensure that you've installed Python tooling by\n"
"running './scripts/pysync' in the current directory (no root needed)"
)
print()
print(res.stdout.decode())
exit(1)
sys.exit(1)
print(colorify("[OK]", Color.GREEN, no_color))
@@ -83,10 +93,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
parser.add_argument("--no-color",
action="store_true",
help="disable colored output",
default=not sys.stdout.isatty())
parser.add_argument(
"--no-color",
action="store_true",
help="disable colored output",
default=not sys.stdout.isatty(),
)
args = parser.parse_args()
files = get_commit_files()
@@ -101,9 +113,23 @@ if __name__ == "__main__":
no_color=args.no_color,
)
check(
name="yapf",
name="isort",
suffix=".py",
cmd=yapf(fix_inplace=args.fix_inplace),
cmd=isort(fix_inplace=args.fix_inplace),
changed_files=files,
no_color=args.no_color,
)
check(
name="black",
suffix=".py",
cmd=black(fix_inplace=args.fix_inplace),
changed_files=files,
no_color=args.no_color,
)
check(
name="flake8",
suffix=".py",
cmd=flake8(),
changed_files=files,
no_color=args.no_color,
)

View File

@@ -11,10 +11,11 @@ bstr = "0.2.17"
bytes = { version = "1.0.1", features = ['serde'] }
clap = "3.0"
futures = "0.3.13"
hashbrown = "0.11.2"
hashbrown = "0.12"
hex = "0.4.3"
hmac = "0.12.1"
hyper = "0.14"
itertools = "0.10.3"
once_cell = "1.13.0"
md5 = "0.7.0"
parking_lot = "0.12"
@@ -23,7 +24,7 @@ rand = "0.8.3"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
routerify = "3"
rustls = "0.20.0"
rustls-pemfile = "0.2.1"
rustls-pemfile = "1"
scopeguard = "1.1.0"
serde = "1"
serde_json = "1"

View File

@@ -127,7 +127,7 @@ impl<T, E> BackendType<Result<T, E>> {
}
}
impl BackendType<ClientCredentials> {
impl BackendType<ClientCredentials<'_>> {
/// Authenticate the client via the requested backend, possibly using credentials.
pub async fn authenticate(
mut self,
@@ -149,7 +149,7 @@ impl BackendType<ClientCredentials> {
// Finally we may finish the initialization of `creds`.
// TODO: add missing type safety to ClientCredentials.
creds.project = Some(payload.project);
creds.project = Some(payload.project.into());
let mut config = match &self {
Console(creds) => {

View File

@@ -121,7 +121,7 @@ pub enum AuthInfo {
#[must_use]
pub(super) struct Api<'a> {
endpoint: &'a ApiUrl,
creds: &'a ClientCredentials,
creds: &'a ClientCredentials<'a>,
}
impl<'a> Api<'a> {
@@ -143,7 +143,7 @@ impl<'a> Api<'a> {
url.path_segments_mut().push("proxy_get_role_secret");
url.query_pairs_mut()
.append_pair("project", self.creds.project().expect("impossible"))
.append_pair("role", &self.creds.user);
.append_pair("role", self.creds.user);
// TODO: use a proper logger
println!("cplane request: {url}");
@@ -187,8 +187,8 @@ impl<'a> Api<'a> {
config
.host(host)
.port(port)
.dbname(&self.creds.dbname)
.user(&self.creds.user);
.dbname(self.creds.dbname)
.user(self.creds.user);
Ok(config)
}

View File

@@ -56,7 +56,7 @@ enum ProxyAuthResponse {
NotReady { ready: bool }, // TODO: get rid of `ready`
}
impl ClientCredentials {
impl ClientCredentials<'_> {
fn is_existing_user(&self) -> bool {
self.user.ends_with("@zenith")
}
@@ -64,15 +64,15 @@ impl ClientCredentials {
async fn authenticate_proxy_client(
auth_endpoint: &reqwest::Url,
creds: &ClientCredentials,
creds: &ClientCredentials<'_>,
md5_response: &str,
salt: &[u8; 4],
psql_session_id: &str,
) -> Result<DatabaseInfo, LegacyAuthError> {
let mut url = auth_endpoint.clone();
url.query_pairs_mut()
.append_pair("login", &creds.user)
.append_pair("database", &creds.dbname)
.append_pair("login", creds.user)
.append_pair("database", creds.dbname)
.append_pair("md5response", md5_response)
.append_pair("salt", &hex::encode(salt))
.append_pair("psql_session_id", psql_session_id);
@@ -103,7 +103,7 @@ async fn authenticate_proxy_client(
async fn handle_existing_user(
auth_endpoint: &reqwest::Url,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
creds: &ClientCredentials,
creds: &ClientCredentials<'_>,
) -> auth::Result<compute::NodeInfo> {
let psql_session_id = super::link::new_psql_session_id();
let md5_salt = rand::random();
@@ -136,7 +136,7 @@ async fn handle_existing_user(
pub async fn handle_user(
auth_endpoint: &reqwest::Url,
auth_link_uri: &reqwest::Url,
creds: &ClientCredentials,
creds: &ClientCredentials<'_>,
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
) -> auth::Result<compute::NodeInfo> {
if creds.is_existing_user() {

View File

@@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
#[must_use]
pub(super) struct Api<'a> {
endpoint: &'a ApiUrl,
creds: &'a ClientCredentials,
creds: &'a ClientCredentials<'a>,
}
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
@@ -87,8 +87,8 @@ impl<'a> Api<'a> {
config
.host(self.endpoint.host_str().unwrap_or("localhost"))
.port(self.endpoint.port().unwrap_or(5432))
.dbname(&self.creds.dbname)
.user(&self.creds.user);
.dbname(self.creds.dbname)
.user(self.creds.user);
Ok(config)
}

View File

@@ -1,6 +1,7 @@
//! User credentials used in authentication.
use crate::error::UserFacingError;
use std::borrow::Cow;
use thiserror::Error;
use utils::pq_proto::StartupMessageParams;
@@ -27,51 +28,59 @@ impl UserFacingError for ClientCredsParseError {}
/// Various client credentials which we use for authentication.
/// Note that we don't store any kind of client key or password here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ClientCredentials {
pub user: String,
pub dbname: String,
pub project: Option<String>,
pub struct ClientCredentials<'a> {
pub user: &'a str,
pub dbname: &'a str,
pub project: Option<Cow<'a, str>>,
}
impl ClientCredentials {
impl ClientCredentials<'_> {
pub fn project(&self) -> Option<&str> {
self.project.as_deref()
}
}
impl ClientCredentials {
impl<'a> ClientCredentials<'a> {
pub fn parse(
mut options: StartupMessageParams,
params: &'a StartupMessageParams,
sni: Option<&str>,
common_name: Option<&str>,
) -> Result<Self, ClientCredsParseError> {
use ClientCredsParseError::*;
// Some parameters are absolutely necessary, others not so much.
let mut get_param = |key| options.remove(key).ok_or(MissingKey(key));
// Some parameters are stored in the startup message.
let get_param = |key| params.get(key).ok_or(MissingKey(key));
let user = get_param("user")?;
let dbname = get_param("database")?;
let project_a = get_param("project").ok();
// Project name might be passed via PG's command-line options.
let project_a = params.options_raw().and_then(|options| {
for opt in options {
if let Some(value) = opt.strip_prefix("project=") {
return Some(Cow::Borrowed(value));
}
}
None
});
// Alternative project name is in fact a subdomain from SNI.
// NOTE: we do not consider SNI if `common_name` is missing.
let project_b = sni
.zip(common_name)
.map(|(sni, cn)| {
// TODO: what if SNI is present but just a common name?
subdomain_from_sni(sni, cn)
.ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned()))
.ok_or_else(|| InconsistentSni(sni.into(), cn.into()))
.map(Cow::<'static, str>::Owned)
})
.transpose()?;
let project = match (project_a, project_b) {
// Invariant: if we have both project name variants, they should match.
(Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))),
(a, b) => a.or(b).map(|name| {
// Invariant: project name may not contain certain characters.
check_project_name(name).map_err(MalformedProjectName)
(Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a.into(), b.into()))),
// Invariant: project name may not contain certain characters.
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
false => Err(MalformedProjectName(name.into())),
true => Ok(name),
}),
}
.transpose()?;
@@ -84,12 +93,8 @@ impl ClientCredentials {
}
}
fn check_project_name(name: String) -> Result<String, String> {
if name.chars().all(|c| c.is_alphanumeric() || c == '-') {
Ok(name)
} else {
Err(name)
}
fn project_name_valid(name: &str) -> bool {
name.chars().all(|c| c.is_alphanumeric() || c == '-')
}
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
@@ -102,18 +107,14 @@ fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
mod tests {
use super::*;
fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams {
StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned())))
}
#[test]
#[ignore = "TODO: fix how database is handled"]
fn parse_bare_minimum() -> anyhow::Result<()> {
// According to postgresql, only `user` should be required.
let options = make_options([("user", "john_doe")]);
let options = StartupMessageParams::new([("user", "john_doe")]);
// TODO: check that `creds.dbname` is None.
let creds = ClientCredentials::parse(options, None, None)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
Ok(())
@@ -121,9 +122,9 @@ mod tests {
#[test]
fn parse_missing_project() -> anyhow::Result<()> {
let options = make_options([("user", "john_doe"), ("database", "world")]);
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
let creds = ClientCredentials::parse(options, None, None)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project, None);
@@ -133,12 +134,12 @@ mod tests {
#[test]
fn parse_project_from_sni() -> anyhow::Result<()> {
let options = make_options([("user", "john_doe"), ("database", "world")]);
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
let sni = Some("foo.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(options, sni, common_name)?;
let creds = ClientCredentials::parse(&options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("foo"));
@@ -148,13 +149,13 @@ mod tests {
#[test]
fn parse_project_from_options() -> anyhow::Result<()> {
let options = make_options([
let options = StartupMessageParams::new([
("user", "john_doe"),
("database", "world"),
("project", "bar"),
("options", "-ckey=1 project=bar -c geqo=off"),
]);
let creds = ClientCredentials::parse(options, None, None)?;
let creds = ClientCredentials::parse(&options, None, None)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("bar"));
@@ -164,16 +165,16 @@ mod tests {
#[test]
fn parse_projects_identical() -> anyhow::Result<()> {
let options = make_options([
let options = StartupMessageParams::new([
("user", "john_doe"),
("database", "world"),
("project", "baz"),
("options", "project=baz"),
]);
let sni = Some("baz.localhost");
let common_name = Some("localhost");
let creds = ClientCredentials::parse(options, sni, common_name)?;
let creds = ClientCredentials::parse(&options, sni, common_name)?;
assert_eq!(creds.user, "john_doe");
assert_eq!(creds.dbname, "world");
assert_eq!(creds.project.as_deref(), Some("baz"));
@@ -183,17 +184,17 @@ mod tests {
#[test]
fn parse_projects_different() {
let options = make_options([
let options = StartupMessageParams::new([
("user", "john_doe"),
("database", "world"),
("project", "first"),
("options", "project=first"),
]);
let sni = Some("second.localhost");
let common_name = Some("localhost");
assert!(matches!(
ClientCredentials::parse(options, sni, common_name).expect_err("should fail"),
ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"),
ClientCredsParseError::InconsistentProjectNames(_, _)
));
}

View File

@@ -95,7 +95,7 @@ impl<'a> Session<'a> {
/// Store the cancel token for the given session.
/// This enables query cancellation in [`crate::proxy::handshake`].
pub fn enable_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
self.cancel_map
.0
.lock()

View File

@@ -1,9 +1,11 @@
use crate::{cancellation::CancelClosure, error::UserFacingError};
use futures::TryFutureExt;
use itertools::Itertools;
use std::{io, net::SocketAddr};
use thiserror::Error;
use tokio::net::TcpStream;
use tokio_postgres::NoTls;
use utils::pq_proto::StartupMessageParams;
#[derive(Debug, Error)]
pub enum ConnectionError {
@@ -110,7 +112,42 @@ pub struct PostgresConnection {
impl NodeInfo {
/// Connect to a corresponding compute node.
pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> {
pub async fn connect(
mut self,
params: &StartupMessageParams,
) -> Result<(PostgresConnection, CancelClosure), ConnectionError> {
if let Some(options) = params.options_raw() {
// We must drop all proxy-specific parameters.
#[allow(unstable_name_collisions)]
let options: String = options
.filter(|opt| !opt.starts_with("project="))
.intersperse(" ") // TODO: use impl from std once it's stabilized
.collect();
self.config.options(&options);
}
if let Some(app_name) = params.get("application_name") {
self.config.application_name(app_name);
}
if let Some(replication) = params.get("replication") {
use tokio_postgres::config::ReplicationMode;
match replication {
"true" | "on" | "yes" | "1" => {
self.config.replication_mode(ReplicationMode::Physical);
}
"database" => {
self.config.replication_mode(ReplicationMode::Logical);
}
_other => {}
}
}
// TODO: extend the list of the forwarded startup parameters.
// Currently, tokio-postgres doesn't allow us to pass
// arbitrary parameters, but the ones above are a good start.
let (socket_addr, mut stream) = self
.connect_raw()
.await

View File

@@ -1,6 +1,6 @@
use crate::auth;
use crate::cancellation::{self, CancelMap};
use crate::config::{ProxyConfig, TlsConfig};
use crate::config::{AuthUrls, ProxyConfig, TlsConfig};
use crate::stream::{MetricsStream, PqStream, Stream};
use anyhow::{bail, Context};
use futures::TryFutureExt;
@@ -93,20 +93,21 @@ async fn handle_client(
None => return Ok(()), // it's a cancellation request
};
// Extract credentials which we're going to use for auth.
let creds = {
let sni = stream.get_ref().sni_hostname();
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
let result = config
.auth_backend
.map(|_| auth::ClientCredentials::parse(params, sni, common_name))
.map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
.transpose();
async { result }.or_else(|e| stream.throw_error(e)).await?
};
let client = Client::new(stream, creds);
let client = Client::new(stream, creds, &params);
cancel_map
.with_session(|session| client.connect_to_db(config, session))
.with_session(|session| client.connect_to_db(&config.auth_urls, session))
.await
}
@@ -174,38 +175,57 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
}
/// Thin connection context.
struct Client<S> {
struct Client<'a, S> {
/// The underlying libpq protocol stream.
stream: PqStream<S>,
/// Client credentials that we care about.
creds: auth::BackendType<auth::ClientCredentials>,
creds: auth::BackendType<auth::ClientCredentials<'a>>,
/// KV-dictionary with PostgreSQL connection params.
params: &'a StartupMessageParams,
}
impl<S> Client<S> {
impl<'a, S> Client<'a, S> {
/// Construct a new connection context.
fn new(stream: PqStream<S>, creds: auth::BackendType<auth::ClientCredentials>) -> Self {
Self { stream, creds }
fn new(
stream: PqStream<S>,
creds: auth::BackendType<auth::ClientCredentials<'a>>,
params: &'a StartupMessageParams,
) -> Self {
Self {
stream,
creds,
params,
}
}
}
impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<S> {
impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
/// Let the client authenticate and connect to the designated compute node.
async fn connect_to_db(
self,
config: &ProxyConfig,
urls: &AuthUrls,
session: cancellation::Session<'_>,
) -> anyhow::Result<()> {
let Self { mut stream, creds } = self;
let Self {
mut stream,
creds,
params,
} = self;
// Authenticate and connect to a compute node.
let auth = creds.authenticate(&config.auth_urls, &mut stream).await;
let auth = creds.authenticate(urls, &mut stream).await;
let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;
let reported_auth_ok = node.reported_auth_ok;
let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?;
let cancel_key_data = session.enable_cancellation(cancel_closure);
let (db, cancel_closure) = node
.connect(params)
.or_else(|e| stream.throw_error(e))
.await?;
let cancel_key_data = session.enable_query_cancellation(cancel_closure);
// Report authentication success if we haven't done this already.
if !node.reported_auth_ok {
if !reported_auth_ok {
stream
.write_message_noflush(&Be::AuthenticationOk)?
.write_message_noflush(&BeParameterStatusMessage::encoding())?;

View File

@@ -27,12 +27,54 @@ prometheus-client = "^0.14.1"
pytest-timeout = "^2.1.0"
Werkzeug = "2.1.2"
pytest-order = "^1.0.1"
allure-pytest = "^2.10.0"
pytest-asyncio = "^0.19.0"
[tool.poetry.dev-dependencies]
yapf = "==0.31.0"
flake8 = "^3.9.2"
flake8 = "^5.0.4"
mypy = "==0.971"
black = "^22.6.0"
isort = "^5.10.1"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.black]
line-length = 100
extend-exclude = '''
/(
vendor
)/
'''
[tool.isort]
profile = "black"
line_length = 100
skip_gitignore = true
skip = [
"vendor",
]
[tool.mypy]
# mypy uses regex
exclude = "^vendor/"
# some tests don't typecheck when this flag is set
check_untyped_defs = false
# Help mypy find imports when running against list of individual files.
# Without this line it would behave differently when executed on the entire project.
mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner"
disallow_incomplete_defs = false
disallow_untyped_calls = false
disallow_untyped_decorators = false
disallow_untyped_defs = false
strict = true
[[tool.mypy.overrides]]
module = [
"asyncpg.*",
"cached_property.*",
"pg8000.*",
]
ignore_missing_imports = true

View File

@@ -13,10 +13,10 @@
# avoid running regular linting script that checks every feature.
if [[ "$OSTYPE" == "darwin"* ]]; then
# no extra features to test currently, add more here when needed
cargo clippy --all --all-targets -- -A unknown_lints -D warnings
cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings
else
# * `-A unknown_lints` do not warn about unknown lint suppressions
# that people with newer toolchains might use
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
fi

View File

@@ -11,7 +11,6 @@ use anyhow::{bail, Context, Result};
use postgres_ffi::PG_TLI;
use regex::Regex;
use std::str::FromStr;
use std::sync::Arc;
use tracing::info;
use utils::{
@@ -67,18 +66,22 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
// ztenant id and ztimeline id are passed in connection string params
fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
if let FeStartupPacket::StartupMessage { params, .. } = sm {
self.ztenantid = match params.get("ztenantid") {
Some(z) => Some(ZTenantId::from_str(z)?), // just curious, can I do that from .map?
_ => None,
};
self.ztimelineid = match params.get("ztimelineid") {
Some(z) => Some(ZTimelineId::from_str(z)?),
_ => None,
};
if let Some(options) = params.options_raw() {
for opt in options {
match opt.split_once('=') {
Some(("ztenantid", value)) => {
self.ztenantid = Some(value.parse()?);
}
Some(("ztimelineid", value)) => {
self.ztimelineid = Some(value.parse()?);
}
_ => continue,
}
}
}
if let Some(app_name) = params.get("application_name") {
self.appname = Some(app_name.clone());
self.appname = Some(app_name.to_owned());
}
Ok(())

Some files were not shown because too many files have changed in this diff Show More