mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 22:10:39 +00:00
@@ -1,18 +1,18 @@
|
||||
**/.git/
|
||||
**/__pycache__
|
||||
**/.pytest_cache
|
||||
*
|
||||
|
||||
.git
|
||||
target
|
||||
tmp_check
|
||||
tmp_install
|
||||
tmp_check_cli
|
||||
test_output
|
||||
.vscode
|
||||
.neon
|
||||
integration_tests/.neon
|
||||
.mypy_cache
|
||||
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
!Cargo.toml
|
||||
!Cargo.lock
|
||||
!Makefile
|
||||
|
||||
!.cargo/
|
||||
!.config/
|
||||
!control_plane/
|
||||
!compute_tools/
|
||||
!libs/
|
||||
!pageserver/
|
||||
!pgxn/
|
||||
!proxy/
|
||||
!safekeeper/
|
||||
!vendor/postgres/
|
||||
!workspace_hack/
|
||||
!neon_local/
|
||||
|
||||
1
.git-blame-ignore-revs
Normal file
1
.git-blame-ignore-revs
Normal file
@@ -0,0 +1 @@
|
||||
4c2bb43775947775401cbb9d774823c5723a91f8
|
||||
23
.github/ISSUE_TEMPLATE/bug-template.md
vendored
Normal file
23
.github/ISSUE_TEMPLATE/bug-template.md
vendored
Normal file
@@ -0,0 +1,23 @@
|
||||
---
|
||||
name: Bug Template
|
||||
about: Used for describing bugs
|
||||
title: ''
|
||||
labels: t/bug
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
## Steps to reproduce
|
||||
|
||||
|
||||
## Expected result
|
||||
|
||||
|
||||
## Actual result
|
||||
|
||||
|
||||
## Environment
|
||||
|
||||
|
||||
## Logs, links
|
||||
-
|
||||
25
.github/ISSUE_TEMPLATE/epic-template.md
vendored
Normal file
25
.github/ISSUE_TEMPLATE/epic-template.md
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
name: Epic Template
|
||||
about: A set of related tasks contributing towards a specific outcome, comprising
|
||||
more than 1 week of work.
|
||||
title: 'Epic: '
|
||||
labels: t/Epic
|
||||
assignees: ''
|
||||
|
||||
---
|
||||
|
||||
## Motivation
|
||||
|
||||
|
||||
## DoD
|
||||
|
||||
|
||||
## Implementation ideas
|
||||
|
||||
|
||||
## Tasks
|
||||
- [ ]
|
||||
|
||||
|
||||
## Other related tasks and Epics
|
||||
-
|
||||
20
.github/PULL_REQUEST_TEMPLATE/release-pr.md
vendored
Normal file
20
.github/PULL_REQUEST_TEMPLATE/release-pr.md
vendored
Normal file
@@ -0,0 +1,20 @@
|
||||
## Release 202Y-MM-DD
|
||||
|
||||
**NB: this PR must be merged only by 'Create a merge commit'!**
|
||||
|
||||
### Checklist when preparing for release
|
||||
- [ ] Read or refresh [the release flow guide](https://github.com/neondatabase/cloud/wiki/Release:-general-flow)
|
||||
- [ ] Announce in the [cloud Slack channel](https://neondb.slack.com/archives/C033A2WE6BZ) that you are going to roll out the release, and ask whether there are any blockers.
|
||||
- [ ] Does this release contain any db migrations? Destructive ones? What is the rollback plan?
|
||||
|
||||
<!-- List everything that should be done **before** release, any issues / setting changes / etc -->
|
||||
|
||||
### Checklist after release
|
||||
- [ ] Based on the merged commits, write release notes and open a PR in the `website` repo ([example](https://github.com/neondatabase/website/pull/120/files))
|
||||
- [ ] Check [#dev-production-stream](https://neondb.slack.com/archives/C03F5SM1N02) Slack channel
|
||||
- [ ] Check [stuck projects page](https://console.neon.tech/admin/projects?sort=last_active&order=desc&stuck=true)
|
||||
- [ ] Check [recent operation failures](https://console.neon.tech/admin/operations?action=create_timeline%2Cstart_compute%2Cstop_compute%2Csuspend_compute%2Capply_config%2Cdelete_timeline%2Cdelete_tenant%2Ccreate_branch%2Ccheck_availability&sort=updated_at&order=desc&had_retries=some)
|
||||
- [ ] Check [cloud SLO dashboard](https://observer.zenith.tech/d/_oWcBMJ7k/cloud-slos?orgId=1)
|
||||
- [ ] Check [compute startup metrics dashboard](https://observer.zenith.tech/d/5OkYJEmVz/compute-startup-time)
|
||||
|
||||
<!-- List everything that should be done **after** release, any admin UI configuration / Grafana dashboard / alert changes / setting changes / etc -->
|
||||
217
.github/actions/allure-report/action.yml
vendored
Normal file
217
.github/actions/allure-report/action.yml
vendored
Normal file
@@ -0,0 +1,217 @@
|
||||
name: 'Create Allure report'
|
||||
description: 'Create and publish Allure report'
|
||||
|
||||
inputs:
|
||||
action:
|
||||
desctiption: 'generate or store'
|
||||
required: true
|
||||
build_type:
|
||||
description: '`build_type` from run-python-test-set action'
|
||||
required: true
|
||||
test_selection:
|
||||
description: '`test_selector` from run-python-test-set action'
|
||||
required: false
|
||||
|
||||
runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Validate input parameters
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
|
||||
echo 2>&1 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
|
||||
echo 2>&1 "inputs.test_selection must be set for 'store' action"
|
||||
exit 2
|
||||
fi
|
||||
|
||||
- name: Calculate key
|
||||
id: calculate-key
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
|
||||
|
||||
pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
|
||||
if [ "${pr_number}" != "null" ]; then
|
||||
key=pr-${pr_number}
|
||||
elif [ "${GITHUB_REF}" = "refs/heads/main" ]; then
|
||||
# Shortcut for a special branch
|
||||
key=main
|
||||
else
|
||||
key=branch-$(echo ${GITHUB_REF#refs/heads/} | tr -c "[:alnum:]._-" "-")
|
||||
fi
|
||||
echo "::set-output name=KEY::${key}"
|
||||
|
||||
- uses: actions/setup-java@v3
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
with:
|
||||
distribution: 'temurin'
|
||||
java-version: '17'
|
||||
|
||||
- name: Install Allure
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
if ! which allure; then
|
||||
ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
|
||||
wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
|
||||
echo "${ALLURE_ZIP_MD5} ${ALLURE_ZIP}" | md5sum -c
|
||||
unzip -q ${ALLURE_ZIP}
|
||||
echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
|
||||
rm -f ${ALLURE_ZIP}
|
||||
fi
|
||||
env:
|
||||
ALLURE_VERSION: 2.19.0
|
||||
ALLURE_ZIP_MD5: ced21401a1a8b9dfb68cee9e4c210464
|
||||
|
||||
- name: Upload Allure results
|
||||
if: ${{ inputs.action == 'store' }}
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUCKET: neon-github-public-dev
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Add metadata
|
||||
cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
|
||||
{
|
||||
"name": "GitHub Actions",
|
||||
"type": "github",
|
||||
"url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
|
||||
"buildOrder": ${GITHUB_RUN_ID},
|
||||
"buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
|
||||
"buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
|
||||
"reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
|
||||
"reportName": "Allure Report"
|
||||
}
|
||||
EOF
|
||||
cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
|
||||
TEST_SELECTION=${{ inputs.test_selection }}
|
||||
BUILD_TYPE=${{ inputs.build_type }}
|
||||
EOF
|
||||
|
||||
ARCHIVE="${GITHUB_RUN_ID}-${{ inputs.test_selection }}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
|
||||
aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
|
||||
|
||||
# Several builds may be running for the same key at once (for example, on the main branch), so we use an improvised lock to serialize them
|
||||
- name: Acquire Allure lock
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
|
||||
BUCKET: neon-github-public-dev
|
||||
run: |
|
||||
LOCK_TIMEOUT=300 # seconds
|
||||
|
||||
for _ in $(seq 1 5); do
|
||||
for i in $(seq 1 ${LOCK_TIMEOUT}); do
|
||||
LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
|
||||
# `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
|
||||
if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" > lock.txt
|
||||
aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
|
||||
|
||||
# A double-check that exactly WE have acquired the lock
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
- name: Generate and publish final Allure report
|
||||
if: ${{ inputs.action == 'generate' }}
|
||||
id: generate-report
|
||||
env:
|
||||
REPORT_PREFIX: reports/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
RAW_PREFIX: reports-raw/${{ steps.calculate-key.outputs.KEY }}/${{ inputs.build_type }}
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUCKET: neon-github-public-dev
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
# Get previously uploaded data for this run
|
||||
ZSTD_NBTHREADS=0
|
||||
|
||||
s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output '.Contents[].Key')
|
||||
if [ -z "$s3_filepaths" ]; then
|
||||
# There's no previously uploaded data for this run
|
||||
exit 0
|
||||
fi
|
||||
for s3_filepath in ${s3_filepaths}; do
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
|
||||
|
||||
archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
|
||||
mkdir -p ${archive%.tar.zst}
|
||||
tar -xf ${archive} -C ${archive%.tar.zst}
|
||||
rm -f ${archive}
|
||||
done
|
||||
|
||||
# Get history trend
|
||||
aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
|
||||
|
||||
# Generate report
|
||||
allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
|
||||
|
||||
# Replace a logo link with a redirect to the latest version of the report
|
||||
sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
|
||||
|
||||
# Upload the history first and then the final report (in this particular order, so that the history is not duplicated in 2 places)
|
||||
aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
|
||||
aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
|
||||
|
||||
REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
|
||||
|
||||
# Generate redirect
|
||||
cat <<EOF > ./index.html
|
||||
<!DOCTYPE html>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<title>Redirecting to ${REPORT_URL}</title>
|
||||
<meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
|
||||
EOF
|
||||
aws s3 cp --only-show-errors ./index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
|
||||
|
||||
echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
|
||||
echo "::set-output name=REPORT_URL::${REPORT_URL}"
|
||||
|
||||
- name: Release Allure lock
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
env:
|
||||
LOCK_FILE: reports/${{ steps.calculate-key.outputs.KEY }}/lock.txt
|
||||
BUCKET: neon-github-public-dev
|
||||
run: |
|
||||
aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
|
||||
|
||||
if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${{ inputs.test_selection }}" ]; then
|
||||
aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
|
||||
fi
|
||||
|
||||
- uses: actions/github-script@v6
|
||||
if: ${{ inputs.action == 'generate' && always() }}
|
||||
env:
|
||||
REPORT_URL: ${{ steps.generate-report.outputs.REPORT_URL }}
|
||||
BUILD_TYPE: ${{ inputs.build_type }}
|
||||
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||
with:
|
||||
script: |
|
||||
const { REPORT_URL, BUILD_TYPE, SHA } = process.env
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
sha: `${SHA}`,
|
||||
state: 'success',
|
||||
target_url: `${REPORT_URL}`,
|
||||
context: `Allure report / ${BUILD_TYPE}`,
|
||||
})
|
||||
41
.github/actions/run-python-test-set/action.yml
vendored
41
.github/actions/run-python-test-set/action.yml
vendored
@@ -3,11 +3,11 @@ description: 'Runs a Neon python test set, performing all the required preparati
|
||||
|
||||
inputs:
|
||||
build_type:
|
||||
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
|
||||
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug", or "remote" for the remote cluster'
|
||||
required: true
|
||||
rust_toolchain:
|
||||
description: 'Rust toolchain version to fetch the caches'
|
||||
required: true
|
||||
required: false
|
||||
test_selection:
|
||||
description: 'A python test suite to run'
|
||||
required: true
|
||||
@@ -24,7 +24,7 @@ inputs:
|
||||
required: false
|
||||
default: 'true'
|
||||
save_perf_report:
|
||||
description: 'Whether to upload the performance report'
|
||||
description: 'Whether to upload the performance report, if true PERF_TEST_RESULT_CONNSTR env variable should be set'
|
||||
required: false
|
||||
default: 'false'
|
||||
run_with_real_s3:
|
||||
@@ -52,6 +52,7 @@ runs:
|
||||
using: "composite"
|
||||
steps:
|
||||
- name: Get Neon artifact
|
||||
if: inputs.build_type != 'remote'
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
|
||||
@@ -78,7 +79,6 @@ runs:
|
||||
- name: Run pytest
|
||||
env:
|
||||
NEON_BIN: /tmp/neon/bin
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
# this variable will be embedded in perf test report
|
||||
# and is needed to distinguish different environments
|
||||
@@ -88,6 +88,12 @@ runs:
|
||||
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
|
||||
shell: bash -euxo pipefail {0}
|
||||
run: |
|
||||
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
|
||||
|
||||
if [ "${BUILD_TYPE}" = "remote" ]; then
|
||||
export REMOTE_ENV=1
|
||||
fi
|
||||
|
||||
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
|
||||
rm -rf $PERF_REPORT_DIR
|
||||
|
||||
@@ -119,6 +125,13 @@ runs:
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
cov_prefix=()
|
||||
else
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
# Wake up the cluster if we use remote neon instance
|
||||
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
|
||||
${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
|
||||
fi
|
||||
|
||||
# Run the tests.
|
||||
@@ -131,11 +144,12 @@ runs:
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||
# in parallel and logs are mixed between different tests
|
||||
mkdir -p $TEST_OUTPUT/allure/results
|
||||
"${cov_prefix[@]}" ./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--alluredir=$TEST_OUTPUT/allure/results \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "not remote_cluster" \
|
||||
-rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
|
||||
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
|
||||
@@ -146,17 +160,10 @@ runs:
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Delete all data but logs
|
||||
shell: bash -euxo pipefail {0}
|
||||
- name: Create Allure report
|
||||
if: always()
|
||||
run: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: ./.github/actions/upload
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
|
||||
path: /tmp/test_output/
|
||||
action: store
|
||||
build_type: ${{ inputs.build_type }}
|
||||
test_selection: ${{ inputs.test_selection }}
|
||||
|
||||
92
.github/workflows/benchmarking.yml
vendored
92
.github/workflows/benchmarking.yml
vendored
@@ -106,7 +106,7 @@ jobs:
|
||||
mkdir -p perf-report-staging
|
||||
# Set the --sparse-ordering option of the pytest-order plugin to ensure tests run in the order they appear in the file,
|
||||
# it's important for test_perf_pgbench.py::test_pgbench_remote_* tests
|
||||
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --skip-interfering-proc-check --out-dir perf-report-staging --timeout 5400
|
||||
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --sparse-ordering --out-dir perf-report-staging --timeout 5400
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
@@ -128,9 +128,9 @@ jobs:
|
||||
env:
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10gb"
|
||||
REMOTE_ENV: "1"
|
||||
POSTGRES_DISTRIB_DIR: /usr
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
BUILD_TYPE: remote
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -138,23 +138,15 @@ jobs:
|
||||
connstr: [ BENCHMARK_CAPTEST_CONNSTR, BENCHMARK_RDS_CONNSTR ]
|
||||
|
||||
runs-on: dev
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2817580636
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:pinned
|
||||
options: --init
|
||||
|
||||
timeout-minutes: 360 # 6h
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- name: Cache poetry deps
|
||||
id: cache_poetry
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Calculate platform
|
||||
id: calculate-platform
|
||||
env:
|
||||
@@ -173,50 +165,56 @@ jobs:
|
||||
|
||||
- name: Install Deps
|
||||
run: |
|
||||
echo "deb http://apt.postgresql.org/pub/repos/apt focal-pgdg main" | sudo tee /etc/apt/sources.list.d/pgdg.list
|
||||
wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
|
||||
sudo apt -y update
|
||||
sudo apt install -y postgresql-14 postgresql-client-14
|
||||
sudo apt install -y postgresql-14
|
||||
|
||||
- name: Benchmark init
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
run: |
|
||||
mkdir -p perf-report-captest
|
||||
|
||||
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
|
||||
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_init -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
|
||||
|
||||
- name: Benchmark simple-update
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
run: |
|
||||
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
|
||||
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_simple_update -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
|
||||
|
||||
- name: Benchmark select-only
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
run: |
|
||||
psql $BENCHMARK_CONNSTR -c "SELECT 1;"
|
||||
./scripts/pytest test_runner/performance/test_perf_pgbench.py::test_pgbench_remote_select_only -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-captest --timeout 21600
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
run: |
|
||||
REPORT_FROM=$(realpath perf-report-captest) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
|
||||
|
||||
- name: Upload logs
|
||||
if: always()
|
||||
uses: ./.github/actions/upload
|
||||
- name: Benchmark simple-update
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
name: bench-captest-${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
path: /tmp/test_output/
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Benchmark simple-update
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
|
||||
env:
|
||||
PLATFORM: ${{ steps.calculate-platform.outputs.PLATFORM }}
|
||||
BENCHMARK_CONNSTR: ${{ secrets[matrix.connstr] }}
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
|
||||
- name: Create Allure report
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: ${{ env.BUILD_TYPE }}
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: ${{ github.event.schedule && failure() }}
|
||||
|
||||
105
.github/workflows/build_and_test.yml
vendored
105
.github/workflows/build_and_test.yml
vendored
@@ -95,11 +95,11 @@ jobs:
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
|
||||
CARGO_FEATURES=""
|
||||
CARGO_FLAGS=""
|
||||
CARGO_FLAGS="--locked"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=""
|
||||
CARGO_FEATURES="--features profiling"
|
||||
CARGO_FLAGS="--release $CARGO_FEATURES"
|
||||
CARGO_FLAGS="--locked --release $CARGO_FEATURES"
|
||||
fi
|
||||
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
|
||||
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
|
||||
@@ -121,8 +121,8 @@ jobs:
|
||||
target/
|
||||
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
key: |
|
||||
v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
v6-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
|
||||
v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
|
||||
|
||||
- name: Cache postgres build
|
||||
id: cache_pg
|
||||
@@ -136,6 +136,10 @@ jobs:
|
||||
run: mold -run make postgres -j$(nproc)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Build neon extensions
|
||||
run: mold -run make neon-pg-ext -j$(nproc)
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
@@ -202,7 +206,7 @@ jobs:
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
pg_regress-tests:
|
||||
regress-tests:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
@@ -220,42 +224,13 @@ jobs:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Pytest regress tests
|
||||
- name: Pytest regression tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: batch_pg_regress
|
||||
test_selection: regress
|
||||
needs_postgres_source: true
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
other-tests:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ build-neon ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Pytest other tests
|
||||
uses: ./.github/actions/run-python-test-set
|
||||
with:
|
||||
build_type: ${{ matrix.build_type }}
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: batch_others
|
||||
run_with_real_s3: true
|
||||
real_s3_bucket: ci-tests-s3
|
||||
real_s3_region: us-west-2
|
||||
@@ -298,12 +273,35 @@ jobs:
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
merge-allure-report:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ regress-tests, benchmarks ]
|
||||
if: always()
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: false
|
||||
|
||||
- name: Create Allure report
|
||||
uses: ./.github/actions/allure-report
|
||||
with:
|
||||
action: generate
|
||||
build_type: ${{ matrix.build_type }}
|
||||
|
||||
coverage-report:
|
||||
runs-on: dev
|
||||
container:
|
||||
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
|
||||
options: --init
|
||||
needs: [ other-tests, pg_regress-tests ]
|
||||
needs: [ regress-tests ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -325,7 +323,7 @@ jobs:
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v5-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
key: v7-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
@@ -460,19 +458,18 @@ jobs:
|
||||
- name: Configure ECR login
|
||||
run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json
|
||||
|
||||
- name: Kaniko build compute node
|
||||
working-directory: ./vendor/postgres/
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
|
||||
- name: Kaniko build compute node with extensions
|
||||
run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID
|
||||
|
||||
promote-images:
|
||||
runs-on: dev
|
||||
needs: [ neon-image, compute-tools-image, compute-node-image ]
|
||||
needs: [ neon-image, compute-node-image, compute-tools-image ]
|
||||
if: github.event_name != 'workflow_dispatch'
|
||||
container: amazon/aws-cli
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
name: [ neon, compute-tools, compute-node ]
|
||||
name: [ neon, compute-node, compute-tools ]
|
||||
|
||||
steps:
|
||||
- name: Promote image to latest
|
||||
@@ -489,18 +486,6 @@ jobs:
|
||||
run: |
|
||||
go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0
|
||||
go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0
|
||||
|
||||
# - name: Get build tag
|
||||
# run: |
|
||||
# if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
# echo "::set-output name=tag::$(git rev-list --count HEAD)"
|
||||
# elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
# echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
|
||||
# else
|
||||
# echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release' "
|
||||
# echo "::set-output name=tag::$GITHUB_RUN_ID"
|
||||
# fi
|
||||
# id: build-tag
|
||||
|
||||
- name: Configure ECR login
|
||||
run: |
|
||||
@@ -516,6 +501,9 @@ jobs:
|
||||
- name: Pull compute node image from ECR
|
||||
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:latest compute-node
|
||||
|
||||
- name: Pull rust image from ECR
|
||||
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust
|
||||
|
||||
- name: Configure docker login
|
||||
run: |
|
||||
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
|
||||
@@ -531,6 +519,9 @@ jobs:
|
||||
- name: Push compute node image to Docker Hub
|
||||
run: crane push compute-node neondatabase/compute-node:${{needs.tag.outputs.build-tag}}
|
||||
|
||||
- name: Push rust image to Docker Hub
|
||||
run: crane push rust neondatabase/rust:pinned
|
||||
|
||||
- name: Add latest tag to images
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
@@ -567,7 +558,7 @@ jobs:
|
||||
#container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
# We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version.
|
||||
# If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag ]
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
@@ -622,7 +613,7 @@ jobs:
|
||||
runs-on: dev
|
||||
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:latest
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently.
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag ]
|
||||
needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
|
||||
21
.github/workflows/codestyle.yml
vendored
21
.github/workflows/codestyle.yml
vendored
@@ -65,7 +65,7 @@ jobs:
|
||||
|
||||
- name: Cache postgres build
|
||||
id: cache_pg
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
tmp_install/
|
||||
@@ -81,6 +81,9 @@ jobs:
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: make postgres
|
||||
|
||||
- name: Build neon extensions
|
||||
run: make neon-pg-ext
|
||||
|
||||
# Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
|
||||
# and the real cause will be inside config.log
|
||||
- name: Print configure logs in case of failure
|
||||
@@ -94,20 +97,20 @@ jobs:
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
uses: actions/cache@v2
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry
|
||||
!~/.cargo/registry/src
|
||||
~/.cargo/git
|
||||
target
|
||||
key: v2-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
|
||||
key: v3-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
|
||||
- name: Ensure all project builds
|
||||
run: cargo build --all --all-targets
|
||||
run: cargo build --locked --all --all-targets
|
||||
|
||||
check-codestyle-python:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
@@ -128,8 +131,14 @@ jobs:
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run yapf to ensure code format
|
||||
run: poetry run yapf --recursive --diff .
|
||||
- name: Run isort to ensure code format
|
||||
run: poetry run isort --diff --check .
|
||||
|
||||
- name: Run black to ensure code format
|
||||
run: poetry run black --diff --check .
|
||||
|
||||
- name: Run flake8 to ensure code format
|
||||
run: poetry run flake8 .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
45
.github/workflows/notifications.yml
vendored
45
.github/workflows/notifications.yml
vendored
@@ -1,45 +0,0 @@
|
||||
name: Send Notifications
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
send-notifications:
|
||||
timeout-minutes: 30
|
||||
name: send commit notifications
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 2
|
||||
|
||||
- name: Form variables for notification message
|
||||
id: git_info_grab
|
||||
run: |
|
||||
git_stat=$(git show --stat=50)
|
||||
git_stat="${git_stat//'%'/'%25'}"
|
||||
git_stat="${git_stat//$'\n'/'%0A'}"
|
||||
git_stat="${git_stat//$'\r'/'%0D'}"
|
||||
git_stat="${git_stat// / }" # space -> 'Space En', as github tends to eat ordinary spaces
|
||||
echo "::set-output name=git_stat::$git_stat"
|
||||
echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
|
||||
echo "##[set-output name=git_branch;]$(echo ${GITHUB_REF#refs/heads/})"
|
||||
|
||||
- name: Send notification
|
||||
uses: appleboy/telegram-action@master
|
||||
with:
|
||||
to: ${{ secrets.TELEGRAM_TO }}
|
||||
token: ${{ secrets.TELEGRAM_TOKEN }}
|
||||
format: markdown
|
||||
args: |
|
||||
*@${{ github.actor }} pushed to* [${{ github.repository }}:${{steps.git_info_grab.outputs.git_branch}}](github.com/${{ github.repository }}/commit/${{steps.git_info_grab.outputs.sha_short }})
|
||||
|
||||
```
|
||||
${{ steps.git_info_grab.outputs.git_stat }}
|
||||
```
|
||||
|
||||
10
.yapfignore
10
.yapfignore
@@ -1,10 +0,0 @@
|
||||
# This file is only read when `yapf` is run from this directory.
|
||||
# Hence we only list top-level directories here to avoid confusion.
|
||||
# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
|
||||
vendor/
|
||||
target/
|
||||
tmp_install/
|
||||
__pycache__/
|
||||
test_output/
|
||||
.neon/
|
||||
.git/
|
||||
247
Cargo.lock
generated
247
Cargo.lock
generated
@@ -48,9 +48,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.58"
|
||||
version = "1.0.59"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bb07d2053ccdbe10e2af2995a2f116c1330396493dc1269f6a91d0ae82e19704"
|
||||
checksum = "c91f1f46651137be86f3a2b9a8359f9ab421d04d941c62b5982e1ca21113adf9"
|
||||
dependencies = [
|
||||
"backtrace",
|
||||
]
|
||||
@@ -77,7 +77,7 @@ dependencies = [
|
||||
"num-traits",
|
||||
"rusticata-macros",
|
||||
"thiserror",
|
||||
"time 0.3.11",
|
||||
"time 0.3.12",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -126,9 +126,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "async-trait"
|
||||
version = "0.1.56"
|
||||
version = "0.1.57"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96cf8829f67d2eab0b2dfa42c5d0ef737e0724e4a82b01b3e292456202b19716"
|
||||
checksum = "76464446b8bc32758d7e88ee1a804d9914cd9b1cb264c029899680b0be29826f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -166,7 +166,7 @@ dependencies = [
|
||||
"http",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
"matchit",
|
||||
"memchr",
|
||||
"mime",
|
||||
@@ -298,9 +298,9 @@ checksum = "37ccbd214614c6783386c1af30caf03192f17891059cecc394b4fb119e363de3"
|
||||
|
||||
[[package]]
|
||||
name = "bytemuck"
|
||||
version = "1.10.0"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c53dfa917ec274df8ed3c572698f381a24eef2efba9492d797301b72b6db408a"
|
||||
checksum = "a5377c8865e74a160d21f29c2d40669f53286db6eab59b88540cbb12ffc8b835"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
@@ -310,9 +310,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610"
|
||||
|
||||
[[package]]
|
||||
name = "bytes"
|
||||
version = "1.1.0"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8"
|
||||
checksum = "ec8a7b6a70fde80372154c65702f00a0f56f3e1c36abbc6c440484be248856db"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
@@ -386,9 +386,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "3.2.12"
|
||||
version = "3.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab8b79fe3946ceb4a0b1c080b4018992b8d27e9ff363644c1c9b6387c854614d"
|
||||
checksum = "a3dbbb6653e7c55cc8595ad3e1f7be8f32aba4eb7ff7f0fd1163d4f3d137c0a9"
|
||||
dependencies = [
|
||||
"atty",
|
||||
"bitflags",
|
||||
@@ -455,7 +455,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"chrono",
|
||||
"clap 3.2.12",
|
||||
"clap 3.2.16",
|
||||
"env_logger",
|
||||
"hyper",
|
||||
"log",
|
||||
@@ -601,9 +601,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.5"
|
||||
version = "0.5.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c02a4d71819009c192cf4872265391563fd6a84c81ff2c0f2a7026ca4c1d85c"
|
||||
checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
@@ -611,9 +611,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.1"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e"
|
||||
checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-epoch",
|
||||
@@ -622,9 +622,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.9"
|
||||
version = "0.9.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07db9d94cbd326813772c968ccd25999e5f8ae22f4f8d1b11effa37ef6ce281d"
|
||||
checksum = "045ebe27666471bb549370b4b0b3e51b07f56325befa4284db65fc89c02511b1"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
@@ -636,9 +636,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.10"
|
||||
version = "0.8.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d82ee10ce34d7bc12c2122495e7593a9c41347ecdd64185af4ecf72cb1a7f83"
|
||||
checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
@@ -917,9 +917,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "1.7.0"
|
||||
version = "1.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3fcf0cee53519c866c09b5de1f6c56ff9d647101f81c1964fa632e148896cdf"
|
||||
checksum = "a7a407cfaa3385c4ae6b23e84623d48c2798d06e3e6a1878f7f59f17b3f86499"
|
||||
dependencies = [
|
||||
"instant",
|
||||
]
|
||||
@@ -1086,9 +1086,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.5"
|
||||
version = "0.14.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd48d33ec7f05fbfa152300fdad764757cbded343c1aa1cff2fbaf4134851803"
|
||||
checksum = "bff49e947297f3312447abdca79f45f4738097cc82b06e72054d2223f601f1b9"
|
||||
dependencies = [
|
||||
"typenum",
|
||||
"version_check",
|
||||
@@ -1164,20 +1164,14 @@ version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
@@ -1245,7 +1239,7 @@ checksum = "75f43d41e26995c17e71ee126451dd3941010b0514a81a9d11f3b341debc2399"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1308,7 +1302,7 @@ dependencies = [
|
||||
"http-body",
|
||||
"httparse",
|
||||
"httpdate",
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"tokio",
|
||||
@@ -1379,7 +1373,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "10a35a97730320ffe8e2d410b5d3b69279b98d2c14bdb8b70ea89ecf7888d41e"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown 0.12.3",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1391,7 +1385,7 @@ dependencies = [
|
||||
"ahash",
|
||||
"atty",
|
||||
"indexmap",
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"num-format",
|
||||
@@ -1432,15 +1426,15 @@ checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.2"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "112c678d4050afce233f4f2852bb2eb519230b3cf12f33585275537d7e41578d"
|
||||
checksum = "6c8af84674fe1f223a982c933a0ee1086ac4d4052aa0fb8060c12c6ad838e754"
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.58"
|
||||
version = "0.3.59"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3fac17f7123a73ca62df411b1bf727ccc805daa070338fda671c86dac1bdc27"
|
||||
checksum = "258451ab10b34f8af53416d1fdab72c22e805f0c92a1136d59470ec0b11138b2"
|
||||
dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
@@ -1482,9 +1476,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.126"
|
||||
version = "0.2.127"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
|
||||
checksum = "505e71a4706fa491e9b1b55f51b95d4037d0821ee40131190475f692b35b009b"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
@@ -1659,7 +1653,7 @@ name = "neon_local"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 3.2.12",
|
||||
"clap 3.2.16",
|
||||
"comfy-table",
|
||||
"control_plane",
|
||||
"git-version",
|
||||
@@ -1854,7 +1848,7 @@ dependencies = [
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 3.2.12",
|
||||
"clap 3.2.16",
|
||||
"close_fds",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
@@ -2111,7 +2105,6 @@ dependencies = [
|
||||
"bindgen",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"crc32c",
|
||||
"env_logger",
|
||||
"hex",
|
||||
@@ -2155,9 +2148,9 @@ checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
|
||||
|
||||
[[package]]
|
||||
name = "prettyplease"
|
||||
version = "0.1.16"
|
||||
version = "0.1.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da6ffbe862780245013cb1c0a48c4e44b7d665548088f91f6b90876d0625e4c2"
|
||||
checksum = "697ae720ee02011f439e0701db107ffe2916d83f718342d65d7f8bf7b8a5fee9"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn",
|
||||
@@ -2171,9 +2164,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.40"
|
||||
version = "1.0.43"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dd96a1e8ed2596c337f8eae5f24924ec83f5ad5ab21ea8e455d3566c69fbcaf7"
|
||||
checksum = "0a2ca2c61bc9f3d74d2886294ab7b9853abd9c1ad903a3ac7815c58989bb7bab"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
@@ -2271,13 +2264,14 @@ dependencies = [
|
||||
"base64",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"clap 3.2.12",
|
||||
"clap 3.2.16",
|
||||
"futures",
|
||||
"git-version",
|
||||
"hashbrown 0.11.2",
|
||||
"hashbrown",
|
||||
"hex",
|
||||
"hmac 0.12.1",
|
||||
"hyper",
|
||||
"itertools",
|
||||
"md5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
@@ -2289,7 +2283,7 @@ dependencies = [
|
||||
"routerify",
|
||||
"rstest",
|
||||
"rustls",
|
||||
"rustls-pemfile 0.2.1",
|
||||
"rustls-pemfile",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -2315,20 +2309,11 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quickcheck"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "588f6378e4dd99458b60ec275b4477add41ce4fa9f64dcba6f15adccb19b50d6"
|
||||
dependencies = [
|
||||
"rand",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.20"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3bcdf212e9776fbcb2d23ab029360416bb1706b1aea2d1a5ba002727cbcab804"
|
||||
checksum = "bbe448f377a7d6961e30f5955f9b8d106c3f5e449d493ee1b125c1d43c2b5179"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
@@ -2411,9 +2396,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.2.13"
|
||||
version = "0.2.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "62f25bc4c7e55e0b0b7a1d43fb893f4fa1361d0abe38b9ce4f323c2adfe6ef42"
|
||||
checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
@@ -2508,7 +2493,7 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls",
|
||||
"rustls-pemfile 1.0.0",
|
||||
"rustls-pemfile",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
@@ -2699,18 +2684,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pemfile"
|
||||
version = "0.2.1"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5eebeaeb360c87bfb72e84abdb3447159c0eaececf1bef2aecd65a8be949d1c9"
|
||||
dependencies = [
|
||||
"base64",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pemfile"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e7522c9de787ff061458fe9a829dc790a3f5b22dc571694fc5883f448b94d9a9"
|
||||
checksum = "0864aeff53f8c05aa08d86e5ef839d3dfcf07aeba2db32f12db0ef716e87bd55"
|
||||
dependencies = [
|
||||
"base64",
|
||||
]
|
||||
@@ -2726,15 +2702,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.8"
|
||||
version = "1.0.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24c8ad4f0c00e1eb5bc7614d236a7f1300e3dbd76b68cac8e06fb00b015ad8d8"
|
||||
checksum = "97477e48b4cf8603ad5f7aaf897467cf42ab4218a38ef76fb14c2d6773a6d6a8"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.10"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3f6f92acf49d1b98f7a81226834412ada05458b7364277387724a237f062695"
|
||||
checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09"
|
||||
|
||||
[[package]]
|
||||
name = "safekeeper"
|
||||
@@ -2744,7 +2720,7 @@ dependencies = [
|
||||
"async-trait",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"clap 3.2.12",
|
||||
"clap 3.2.16",
|
||||
"const_format",
|
||||
"crc32c",
|
||||
"daemonize",
|
||||
@@ -2835,15 +2811,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.12"
|
||||
version = "1.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2333e6df6d6598f2b1974829f853c2b4c5f4a6e503c10af918081aa6f8564e1"
|
||||
checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.139"
|
||||
version = "1.0.142"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0171ebb889e45aa68b44aee0859b3eede84c6f5f5c228e6f140c0b2a0a46cad6"
|
||||
checksum = "e590c437916fb6b221e1d00df6e3294f3fccd70ca7e92541c475d6ed6ef5fee2"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
@@ -2860,9 +2836,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.139"
|
||||
version = "1.0.142"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc1d3230c1de7932af58ad8ffbe1d784bd55efd5a9d84ac24f69c72d83543dfb"
|
||||
checksum = "34b5b8d809babe02f538c2cfec6f2c1ed10804c0e5a6a041a049a4f5588ccc2e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -2871,11 +2847,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.82"
|
||||
version = "1.0.83"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "82c2c1fdcd807d1098552c5b9a36e425e42e9fbd7c6a37a8425f390f781f7fa7"
|
||||
checksum = "38dd04e3c8279e75b31ef29dbdceebfe5ad89f4d0937213c53f7d49d01b3d5a7"
|
||||
dependencies = [
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
@@ -2887,7 +2863,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
@@ -2992,7 +2968,7 @@ dependencies = [
|
||||
"num-bigint",
|
||||
"num-traits",
|
||||
"thiserror",
|
||||
"time 0.3.11",
|
||||
"time 0.3.12",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3003,9 +2979,12 @@ checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.6"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb703cfe953bccee95685111adeedb76fabe4e97549a58d16f03ea7b9367bb32"
|
||||
checksum = "4614a76b2a8be0058caa9dbbaf66d988527d86d003c11a94fbd335d7661edcef"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
@@ -3113,9 +3092,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.98"
|
||||
version = "1.0.99"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c50aef8a904de4c23c788f104b7dddc7d6f79c647c7c8ce4cc8f73eb0ca773dd"
|
||||
checksum = "58dbef6ec655055e20b86b15a8cc6d439cca19b667537ac6a1369572d151ab13"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -3191,18 +3170,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.31"
|
||||
version = "1.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a"
|
||||
checksum = "f5f6586b7f764adc0231f4c79be7b920e766bb2f3e51b3661cdb263828f19994"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.31"
|
||||
version = "1.0.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a"
|
||||
checksum = "12bafc5b54507e0149cdf1b145a5d80ab80a90bcd9275df43d4fff68460f6c21"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -3231,14 +3210,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72c91f41dcb2f096c05f0873d667dceec1087ce5bcf984ec8ffb19acddbb3217"
|
||||
checksum = "74b7cc93fc23ba97fde84f7eea56c55d1ba183f495c6715defdfc7b9cb8c870f"
|
||||
dependencies = [
|
||||
"itoa 1.0.2",
|
||||
"itoa 1.0.3",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"num_threads",
|
||||
"quickcheck",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
@@ -3275,9 +3254,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.20.0"
|
||||
version = "1.20.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57aec3cfa4c296db7255446efb4928a6be304b431a806216105542a67b6ca82e"
|
||||
checksum = "7a8325f63a7d4774dd041e363b2409ed1c5cbbd0f867795e661df066b2b0a581"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"bytes",
|
||||
@@ -3607,9 +3586,9 @@ checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.2"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15c61ba63f9235225a22310255a29b806b907c9b8c964bcbd0a2c70f3f2deea7"
|
||||
checksum = "c4f5b37a154999a8f3f98cc23a628d850e154479cd94decf3414696e12e31aaf"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-normalization"
|
||||
@@ -3679,7 +3658,7 @@ dependencies = [
|
||||
"rand",
|
||||
"routerify",
|
||||
"rustls",
|
||||
"rustls-pemfile 0.2.1",
|
||||
"rustls-pemfile",
|
||||
"rustls-split",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -3728,7 +3707,7 @@ name = "wal_craft"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap 3.2.12",
|
||||
"clap 3.2.16",
|
||||
"env_logger",
|
||||
"log",
|
||||
"once_cell",
|
||||
@@ -3772,9 +3751,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.81"
|
||||
version = "0.2.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7c53b543413a17a202f4be280a7e5c62a1c69345f5de525ee64f8cfdbc954994"
|
||||
checksum = "fc7652e3f6c4706c8d9cd54832c4a4ccb9b5336e2c3bd154d5cccfbf1c1f5f7d"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"wasm-bindgen-macro",
|
||||
@@ -3782,13 +3761,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.81"
|
||||
version = "0.2.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5491a68ab4500fa6b4d726bd67408630c3dbe9c4fe7bda16d5c82a1fd8c7340a"
|
||||
checksum = "662cd44805586bd52971b9586b1df85cdbbd9112e4ef4d8f41559c334dc6ac3f"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"once_cell",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
@@ -3797,9 +3776,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.31"
|
||||
version = "0.4.32"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "de9a9cec1733468a8c657e57fa2413d2ae2c0129b95e87c5b72b8ace4d13f31f"
|
||||
checksum = "fa76fb221a1f8acddf5b54ace85912606980ad661ac7a503b4570ffd3a624dad"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
@@ -3809,9 +3788,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.81"
|
||||
version = "0.2.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c441e177922bc58f1e12c022624b6216378e5febc2f0533e41ba443d505b80aa"
|
||||
checksum = "b260f13d3012071dfb1512849c033b1925038373aea48ced3012c09df952c602"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
@@ -3819,9 +3798,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.81"
|
||||
version = "0.2.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d94ac45fcf608c1f45ef53e748d35660f168490c10b23704c7779ab8f5c3048"
|
||||
checksum = "5be8e654bdd9b79216c2929ab90721aa82faf65c48cdf08bdc4e7f51357b80da"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -3832,15 +3811,15 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.81"
|
||||
version = "0.2.82"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a89911bd99e5f3659ec4acf9c4d93b0a90fe4a2a11f15328472058edc5261be"
|
||||
checksum = "6598dd0bd3c7d51095ff6531a5b23e02acdc81804e30d8f07afb77b7215a140a"
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.58"
|
||||
version = "0.3.59"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2fed94beee57daf8dd7d51f2b15dc2bcde92d7a72304cdf662a4371008b71b90"
|
||||
checksum = "ed055ab27f941423197eb86b2035720b1a3ce40504df082cac2ecc6ed73335a1"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
@@ -3965,6 +3944,7 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"anyhow",
|
||||
"bstr",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 2.34.0",
|
||||
@@ -3974,7 +3954,7 @@ dependencies = [
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
"generic-array",
|
||||
"hashbrown 0.11.2",
|
||||
"hashbrown",
|
||||
"hex",
|
||||
"hyper",
|
||||
"indexmap",
|
||||
@@ -3989,11 +3969,12 @@ dependencies = [
|
||||
"prost",
|
||||
"rand",
|
||||
"regex",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"syn",
|
||||
"time 0.3.11",
|
||||
"time 0.3.12",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
@@ -4015,7 +3996,7 @@ dependencies = [
|
||||
"oid-registry",
|
||||
"rusticata-macros",
|
||||
"thiserror",
|
||||
"time 0.3.11",
|
||||
"time 0.3.12",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4044,6 +4025,6 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.5.6"
|
||||
version = "1.5.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20b578acffd8516a6c3f2a1bdefc1ec37e547bb4e0fb8b6b01a4cafc886b4442"
|
||||
checksum = "c394b5bd0c6f669e7275d9c20aa90ae064cb22e75a1cad54e1b34088034b149f"
|
||||
|
||||
34
Dockerfile
34
Dockerfile
@@ -1,18 +1,27 @@
|
||||
### Creates a storage Docker image with postgres, pageserver, safekeeper and proxy binaries.
|
||||
### The image itself is mainly used as a container for the binaries and for starting e2e tests with custom parameters.
|
||||
### By default, the binaries inside the image have some mock parameters and can start, but are not intended to be used
|
||||
### inside this image in the real deployments.
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
|
||||
# Build Postgres
|
||||
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS pg-build
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS pg-build
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
COPY vendor/postgres vendor/postgres
|
||||
COPY Makefile Makefile
|
||||
COPY --chown=nonroot vendor/postgres vendor/postgres
|
||||
COPY --chown=nonroot pgxn pgxn
|
||||
COPY --chown=nonroot Makefile Makefile
|
||||
|
||||
ENV BUILD_TYPE release
|
||||
RUN set -e \
|
||||
&& mold -run make -j $(nproc) -s postgres \
|
||||
&& mold -run make -j $(nproc) -s neon-pg-ext \
|
||||
&& rm -rf tmp_install/build \
|
||||
&& tar -C tmp_install -czf /home/nonroot/postgres_install.tar.gz .
|
||||
|
||||
# Build zenith binaries
|
||||
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS build
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS build
|
||||
WORKDIR /home/nonroot
|
||||
ARG GIT_VERSION=local
|
||||
|
||||
@@ -32,7 +41,7 @@ COPY . .
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be part of the same RUN, since the cachepot daemon is killed at the end of this RUN, losing the compilation stats.
|
||||
RUN set -e \
|
||||
&& mold -run cargo build --release \
|
||||
&& mold -run cargo build --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Build final image
|
||||
@@ -58,7 +67,18 @@ COPY --from=build --chown=zenith:zenith /home/nonroot/target/release/proxy
|
||||
COPY --from=pg-build /home/nonroot/tmp_install/ /usr/local/
|
||||
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
|
||||
|
||||
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
|
||||
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
|
||||
RUN mkdir -p /data/.neon/ && chown -R zenith:zenith /data/.neon/ \
|
||||
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
|
||||
-c "id=1234" \
|
||||
-c "broker_endpoints=['http://etcd:2379']" \
|
||||
-c "pg_distrib_dir='/usr/local'" \
|
||||
-c "listen_pg_addr='0.0.0.0:6400'" \
|
||||
-c "listen_http_addr='0.0.0.0:9898'"
|
||||
|
||||
VOLUME ["/data"]
|
||||
USER zenith
|
||||
EXPOSE 6400
|
||||
CMD ["pageserver"]
|
||||
EXPOSE 9898
|
||||
CMD ["/bin/bash"]
|
||||
|
||||
114
Dockerfile.compute-node
Normal file
114
Dockerfile.compute-node
Normal file
@@ -0,0 +1,114 @@
|
||||
ARG TAG=pinned
|
||||
# apparently, ARGs don't get replaced in RUN commands in kaniko
|
||||
# ARG POSTGIS_VERSION=3.3.0
|
||||
# ARG PLV8_VERSION=3.1.4
|
||||
|
||||
FROM debian:bullseye-slim AS build-deps
|
||||
RUN apt update && \
|
||||
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libcurl4-openssl-dev libossp-uuid-dev
|
||||
|
||||
# Build Postgres from the neon postgres repository.
|
||||
FROM build-deps AS pg-build
|
||||
COPY vendor/postgres postgres
|
||||
RUN cd postgres && \
|
||||
./configure CFLAGS='-O2 -g3' --enable-debug --with-uuid=ossp && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install
|
||||
|
||||
# Build PostGIS from the upstream PostGIS mirror. PostGIS compiles against neon postgres sources without changes.
|
||||
# Perhaps we could even use the upstream binaries, compiled against vanilla Postgres, but it would require some
|
||||
# investigation to check that it works, and also keeps working in the future. So for now, we compile our own binaries.
|
||||
FROM build-deps AS postgis-build
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
|
||||
|
||||
RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
|
||||
tar xvzf postgis-3.3.0.tar.gz && \
|
||||
cd postgis-3.3.0 && \
|
||||
./autogen.sh && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
./configure && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
cd extensions/postgis && \
|
||||
make clean && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control
|
||||
|
||||
# Build plv8
|
||||
FROM build-deps AS plv8-build
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
RUN apt update && \
|
||||
apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
|
||||
|
||||
# https://github.com/plv8/plv8/issues/475
|
||||
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
|
||||
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
|
||||
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends -t testing binutils
|
||||
|
||||
RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
|
||||
tar xvzf v3.1.4.tar.gz && \
|
||||
cd plv8-3.1.4 && \
|
||||
export PATH="/usr/local/pgsql/bin:$PATH" && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
rm -rf /plv8-* && \
|
||||
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control
|
||||
|
||||
# compile neon extensions
|
||||
FROM build-deps AS neon-pg-ext-build
|
||||
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY pgxn/ pgxn/
|
||||
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) \
|
||||
PG_CONFIG=/usr/local/pgsql/bin/pg_config \
|
||||
-C pgxn/neon \
|
||||
-s install
|
||||
|
||||
# Compile and run the Neon-specific `compute_ctl` binary
|
||||
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools
|
||||
USER nonroot
|
||||
# Copy entire project to get Cargo.* files with proper dependencies for the whole project
|
||||
COPY --chown=nonroot . .
|
||||
RUN cd compute_tools && cargo build --locked --release
|
||||
|
||||
# Put it all together into the final image
|
||||
FROM debian:bullseye-slim
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute && \
|
||||
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
# TODO: Check if we can make the extension setup more modular versus a linear build
|
||||
# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc.
|
||||
COPY --from=neon-pg-ext-build --chown=postgres /usr/local/pgsql /usr/local
|
||||
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y libreadline-dev libossp-uuid-dev gdal-bin libgdal-dev libprotobuf-c-dev && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# Debian bullseye provides GLIBC 2.31 when 2.34 is necessary as we compiled plv8 with that version
|
||||
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
|
||||
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends -t testing binutils && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# "temporary" symlink for old control-plane
|
||||
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
USER postgres
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
87
Dockerfile.compute-node.legacy
Normal file
87
Dockerfile.compute-node.legacy
Normal file
@@ -0,0 +1,87 @@
|
||||
#
|
||||
# Legacy version of the Dockerfile for the compute node.
|
||||
# Used by e2e CI. Building Dockerfile.compute-node will take
|
||||
# unreasonable amount of time without v2 runners.
|
||||
#
|
||||
# TODO: remove once cloud repo CI is moved to v2 runners.
|
||||
#
|
||||
|
||||
|
||||
# Allow specifying different compute-tools tag and image repo, so we are
|
||||
# able to use different images
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG IMAGE=compute-tools
|
||||
ARG TAG=latest
|
||||
|
||||
#
|
||||
# Image with pre-built tools
|
||||
#
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS compute-deps
|
||||
# Only to get ready compute_ctl binary as dependency
|
||||
|
||||
#
|
||||
# Image with Postgres build deps
|
||||
#
|
||||
FROM debian:buster-slim AS build-deps
|
||||
|
||||
RUN apt-get update && apt-get -yq install automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
|
||||
libcurl4-openssl-dev libossp-uuid-dev
|
||||
|
||||
#
|
||||
# Image with built Postgres
|
||||
#
|
||||
FROM build-deps AS pg-build
|
||||
|
||||
# Add user postgres
|
||||
RUN adduser postgres
|
||||
RUN mkdir /pg && chown postgres:postgres /pg
|
||||
|
||||
# Copy source files
|
||||
COPY ./vendor/postgres /pg/
|
||||
COPY ./pgxn /pg/
|
||||
|
||||
# Build and install Postgres locally
|
||||
RUN mkdir /pg/compute_build && cd /pg/compute_build && \
|
||||
../configure CFLAGS='-O2 -g3' --prefix=$(pwd)/postgres_bin --enable-debug --with-uuid=ossp && \
|
||||
# Install main binaries and contribs
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
|
||||
# Install headers
|
||||
make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install
|
||||
|
||||
# Install neon contrib
|
||||
RUN make MAKELEVEL=0 PG_CONFIG=/pg/compute_build/postgres_bin/bin/pg_config -j $(getconf _NPROCESSORS_ONLN) -C /pg/neon install
|
||||
|
||||
USER postgres
|
||||
WORKDIR /pg
|
||||
|
||||
#
|
||||
# Final compute node image to be exported
|
||||
#
|
||||
FROM debian:buster-slim
|
||||
|
||||
# libreadline-dev is required to run psql
|
||||
RUN apt-get update && apt-get -yq install libreadline-dev libossp-uuid-dev
|
||||
|
||||
# Add user postgres
|
||||
RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
|
||||
echo "postgres:test_console_pass" | chpasswd && \
|
||||
mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
|
||||
chown -R postgres:postgres /var/db/postgres && \
|
||||
chmod 0750 /var/db/postgres/compute
|
||||
|
||||
# Copy ready Postgres binaries
|
||||
COPY --from=pg-build /pg/compute_build/postgres_bin /usr/local
|
||||
|
||||
# Copy binaries from compute-tools
|
||||
COPY --from=compute-deps /usr/local/bin/compute_ctl /usr/local/bin/compute_ctl
|
||||
|
||||
# XXX: temporary symlink for compatibility with old control-plane
|
||||
RUN ln -s /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
# Add postgres shared objects to the search path
|
||||
RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
|
||||
|
||||
USER postgres
|
||||
|
||||
ENTRYPOINT ["/usr/local/bin/compute_ctl"]
|
||||
@@ -1,6 +1,10 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
|
||||
FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned AS rust-build
|
||||
ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
|
||||
ARG IMAGE=rust
|
||||
ARG TAG=pinned
|
||||
|
||||
FROM $REPOSITORY/$IMAGE:$TAG AS rust-build
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
|
||||
@@ -16,7 +20,7 @@ ARG CACHEPOT_BUCKET=neon-github-dev
|
||||
COPY . .
|
||||
|
||||
RUN set -e \
|
||||
&& mold -run cargo build -p compute_tools --release \
|
||||
&& mold -run cargo build -p compute_tools --locked --release \
|
||||
&& cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
|
||||
26
Makefile
26
Makefile
@@ -51,7 +51,7 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
# Top level Makefile to build Zenith and PostgreSQL
|
||||
#
|
||||
.PHONY: all
|
||||
all: zenith postgres
|
||||
all: zenith postgres neon-pg-ext
|
||||
|
||||
### Zenith Rust bits
|
||||
#
|
||||
@@ -87,25 +87,39 @@ postgres: postgres-configure \
|
||||
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
|
||||
+@echo "Compiling PostgreSQL"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
|
||||
+@echo "Compiling contrib/neon"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
|
||||
+@echo "Compiling contrib/neon_test_utils"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
|
||||
+@echo "Compiling libpq"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq install
|
||||
+@echo "Compiling pg_buffercache"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
|
||||
+@echo "Compiling pageinspect"
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
|
||||
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean:
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect clean
|
||||
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/interfaces/libpq clean
|
||||
|
||||
neon-pg-ext: postgres
|
||||
+@echo "Compiling neon"
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
|
||||
-C $(ROOT_PROJECT_DIR)/pgxn/neon install
|
||||
+@echo "Compiling neon_test_utils"
|
||||
$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/bin/pg_config \
|
||||
-C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils install
|
||||
|
||||
.PHONY: neon-pg-ext-clean
|
||||
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon clean
|
||||
$(MAKE) -C $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils clean
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean:
|
||||
cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
cd pgxn/neon && $(MAKE) clean
|
||||
cd pgxn/neon_test_utils && $(MAKE) clean
|
||||
|
||||
# This removes everything
|
||||
.PHONY: distclean
|
||||
|
||||
@@ -178,6 +178,7 @@ impl ComputeNode {
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
@@ -187,10 +188,13 @@ impl ComputeNode {
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}, stderr: {}",
|
||||
sync_output.status,
|
||||
String::from_utf8(sync_output.stdout).expect("postgres --sync-safekeepers exited, and stdout is not utf-8"),
|
||||
String::from_utf8(sync_output.stderr).expect("postgres --sync-safekeepers exited, and stderr is not utf-8"),
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -62,9 +62,16 @@ impl GenericOption {
|
||||
/// Represent `GenericOption` as configuration option.
|
||||
pub fn to_pg_setting(&self) -> String {
|
||||
if let Some(val) = &self.value {
|
||||
let name = match self.name.as_str() {
|
||||
"safekeepers" => "neon.safekeepers",
|
||||
"wal_acceptor_reconnect" => "neon.safekeeper_reconnect_timeout",
|
||||
"wal_acceptor_connect_timeout" => "neon.safekeeper_connect_timeout",
|
||||
it => it,
|
||||
};
|
||||
|
||||
match self.vartype.as_ref() {
|
||||
"string" => format!("{} = '{}'", self.name, val),
|
||||
_ => format!("{} = {}", self.name, val),
|
||||
"string" => format!("{} = '{}'", name, val),
|
||||
_ => format!("{} = {}", name, val),
|
||||
}
|
||||
} else {
|
||||
self.name.to_owned()
|
||||
|
||||
@@ -85,7 +85,7 @@
|
||||
"vartype": "bool"
|
||||
},
|
||||
{
|
||||
"name": "safekeepers",
|
||||
"name": "neon.safekeepers",
|
||||
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
|
||||
"vartype": "string"
|
||||
},
|
||||
@@ -181,7 +181,6 @@
|
||||
}
|
||||
]
|
||||
},
|
||||
|
||||
"delta_operations": [
|
||||
{
|
||||
"action": "delete_db",
|
||||
|
||||
@@ -28,7 +28,7 @@ mod pg_helpers_tests {
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
"fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
|
||||
"fsync = off\nwal_level = replica\nhot_standby = on\nneon.safekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -150,7 +150,7 @@ impl PostgresNode {
|
||||
let port: u16 = conf.parse_field("port", &context)?;
|
||||
let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?;
|
||||
let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?;
|
||||
let uses_wal_proposer = conf.get("safekeepers").is_some();
|
||||
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
|
||||
|
||||
// parse recovery_target_lsn, if any
|
||||
let recovery_target_lsn: Option<Lsn> =
|
||||
@@ -341,7 +341,7 @@ impl PostgresNode {
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("safekeepers", &safekeepers);
|
||||
conf.append("neon.safekeepers", &safekeepers);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
|
||||
@@ -92,6 +92,7 @@ The layer map tracks what layers exist in a timeline.
|
||||
### Layered repository
|
||||
|
||||
Neon repository implementation that keeps data in layers.
|
||||
|
||||
### LSN
|
||||
|
||||
The Log Sequence Number (LSN) is a unique identifier of a WAL record in the WAL.
|
||||
@@ -125,6 +126,26 @@ TODO: use this name consistently in remote storage code. Now `disk_consistent_ls
|
||||
* `ancestor_lsn` - LSN of the branch point (the LSN at which this branch was created)
|
||||
|
||||
TODO: add table that describes mapping between PostgreSQL (compute), safekeeper and pageserver LSNs.
|
||||
|
||||
### Logical size
|
||||
|
||||
The pageserver tracks the "logical size" of a timeline. It is the
|
||||
total size of all relations in all Postgres databases on the
|
||||
timeline. It includes all user and system tables, including their FSM
|
||||
and VM forks. But it does not include SLRUs, twophase files or any
|
||||
other such data or metadata that lives outside relations.
|
||||
|
||||
The logical size is calculated by the pageserver, and is sent to
|
||||
PostgreSQL via feedback messages to the safekeepers. PostgreSQL uses
|
||||
the logical size to enforce the size limit in the free tier. The
|
||||
logical size is also shown to users in the web console.
|
||||
|
||||
The logical size is not affected by branches or the physical layout of
|
||||
layer files in the pageserver. If you have a database with 1 GB
|
||||
logical size and you create a branch of it, both branches will have 1
|
||||
GB logical size, even though the branch is copy-on-write and won't
|
||||
consume any extra physical disk space until you make changes to it.
|
||||
|
||||
### Page (block)
|
||||
|
||||
The basic structure used to store relation data. All pages are of the same size.
|
||||
|
||||
@@ -112,11 +112,13 @@ Run `poetry shell` to activate the virtual environment.
|
||||
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
|
||||
|
||||
### Obligatory checks
|
||||
We force code formatting via `yapf` and type hints via `mypy`.
|
||||
Run the following commands in the repository's root (next to `setup.cfg`):
|
||||
We force code formatting via `black`, `isort` and type hints via `mypy`.
|
||||
Run the following commands in the repository's root (next to `pyproject.toml`):
|
||||
|
||||
```bash
|
||||
poetry run yapf -ri . # All code is reformatted
|
||||
poetry run isort . # Imports are reformatted
|
||||
poetry run black . # All code is reformatted
|
||||
poetry run flake8 . # Python linter
|
||||
poetry run mypy . # Ensure there are no typing errors
|
||||
```
|
||||
|
||||
@@ -125,7 +127,7 @@ Otherwise it will not find its configuration.
|
||||
|
||||
Also consider:
|
||||
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Running `pycodestyle` (or a linter of your choice) and fixing possible defects, if any.
|
||||
* Adding more type hints to your code to avoid `Any`.
|
||||
|
||||
### Changing dependencies
|
||||
|
||||
@@ -4,7 +4,6 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = "1.0.1"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL non-relation files.
|
||||
//!
|
||||
use crate::transaction_id_precedes;
|
||||
use super::pg_constants;
|
||||
use crate::transaction_id_precedes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
//! to look deeper into the WAL records to also understand which blocks they modify, the code
|
||||
//! for that is in pageserver/src/walrecord.rs
|
||||
//!
|
||||
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC};
|
||||
use super::pg_constants;
|
||||
use super::xlog_utils::*;
|
||||
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
@@ -170,6 +170,7 @@ impl WalStreamDecoder {
|
||||
}
|
||||
State::SkippingEverything { .. } => {}
|
||||
}
|
||||
// now read page contents
|
||||
match &mut self.state {
|
||||
State::WaitingForRecord => {
|
||||
// need to have at least the xl_tot_len field
|
||||
@@ -194,8 +195,8 @@ impl WalStreamDecoder {
|
||||
return Ok(Some(self.complete_record(recordbuf)?));
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// record, and loop back. On next iterations, we will reach the branch
|
||||
// below, and copy the part of the record that was on this or next page(s)
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.state = State::ReassemblingRecord {
|
||||
|
||||
@@ -42,19 +42,13 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
pub trait RemoteObjectName {
|
||||
// Needed to retrieve last component for RemoteObjectId.
|
||||
// In other words a file name
|
||||
fn object_name(&self) -> Option<&str>;
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type RemoteObjectId: RemoteObjectName;
|
||||
type RemoteObjectId;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
|
||||
@@ -71,7 +65,7 @@ pub trait RemoteStorage: Send + Sync {
|
||||
/// so this method doesn't need to.
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
prefix: Option<&Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into the remote storage entry.
|
||||
@@ -163,6 +157,13 @@ impl GenericRemoteStorage {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_local(&self) -> Option<&LocalFs> {
|
||||
match self {
|
||||
Self::Local(local_fs) => Some(local_fs),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
@@ -18,16 +17,10 @@ use tokio::{
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};
|
||||
use crate::{path_with_suffix_extension, Download, DownloadError};
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
impl RemoteObjectName for PathBuf {
|
||||
fn object_name(&self) -> Option<&str> {
|
||||
self.file_stem().and_then(|n| n.to_str())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalFs {
|
||||
working_directory: PathBuf,
|
||||
storage_root: PathBuf,
|
||||
@@ -113,13 +106,10 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
prefix: Option<&Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(prefix),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
get_all_files(path.as_ref(), false).await
|
||||
let path = prefix.unwrap_or(&self.storage_root);
|
||||
get_all_files(path, false).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -150,8 +140,7 @@ impl RemoteStorage for LocalFs {
|
||||
);
|
||||
|
||||
let from_size_bytes = from_size_bytes as u64;
|
||||
// Require to read 1 byte more than the expected to check later, that the stream and its size match.
|
||||
let mut buffer_to_read = from.take(from_size_bytes + 1);
|
||||
let mut buffer_to_read = from.take(from_size_bytes);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
.await
|
||||
@@ -162,17 +151,15 @@ impl RemoteStorage for LocalFs {
|
||||
)
|
||||
})?;
|
||||
|
||||
if bytes_read < from_size_bytes {
|
||||
bail!("Provided stream was shorter than expected: {bytes_read} vs {from_size_bytes} bytes");
|
||||
}
|
||||
// Check if there is any extra data after the given size.
|
||||
let mut from = buffer_to_read.into_inner();
|
||||
let extra_read = from.read(&mut [1]).await?;
|
||||
ensure!(
|
||||
bytes_read == from_size_bytes,
|
||||
"Provided stream has actual size {} fthat is smaller than the given stream size {}",
|
||||
bytes_read,
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
ensure!(
|
||||
buffer_to_read.read(&mut [0]).await? == 0,
|
||||
"Provided stream has bigger size than the given stream size {}",
|
||||
from_size_bytes
|
||||
extra_read == 0,
|
||||
"Provided stream was larger than expected: expected {from_size_bytes} bytes",
|
||||
);
|
||||
|
||||
destination.flush().await.with_context(|| {
|
||||
@@ -609,6 +596,34 @@ mod fs_tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file_negatives() -> anyhow::Result<()> {
|
||||
let storage = create_storage()?;
|
||||
|
||||
let id = storage.remote_object_id(&storage.working_directory.join("dummy"))?;
|
||||
let content = std::io::Cursor::new(b"12345");
|
||||
|
||||
// Check that you get an error if the size parameter doesn't match the actual
|
||||
// size of the stream.
|
||||
storage
|
||||
.upload(content.clone(), 0, &id, None)
|
||||
.await
|
||||
.expect_err("upload with zero size succeeded");
|
||||
storage
|
||||
.upload(content.clone(), 4, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too short size succeeded");
|
||||
storage
|
||||
.upload(content.clone(), 6, &id, None)
|
||||
.await
|
||||
.expect_err("upload with too large size succeeded");
|
||||
|
||||
// Correct size is 5, this should succeed.
|
||||
storage.upload(content, 5, &id, None).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
LocalFs::new(tempdir()?.path().to_owned(), tempdir()?.path().to_owned())
|
||||
}
|
||||
|
||||
@@ -19,9 +19,7 @@ use tokio::{io, sync::Semaphore};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{
|
||||
strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
|
||||
};
|
||||
use crate::{strip_path_prefix, Download, DownloadError, RemoteStorage, S3Config};
|
||||
|
||||
use super::StorageMetadata;
|
||||
|
||||
@@ -96,6 +94,23 @@ const S3_PREFIX_SEPARATOR: char = '/';
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
/// Turn a/b/c or a/b/c/ into c
|
||||
pub fn object_name(&self) -> Option<&str> {
|
||||
// corner case, char::to_string is not const, that's why this is more verbose than it needs to be
|
||||
// see https://github.com/rust-lang/rust/issues/88674
|
||||
if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
|
||||
return None;
|
||||
}
|
||||
|
||||
if self.0.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
|
||||
} else {
|
||||
self.0
|
||||
.rsplit_once(S3_PREFIX_SEPARATOR)
|
||||
.map(|(_, last)| last)
|
||||
}
|
||||
}
|
||||
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
@@ -119,25 +134,6 @@ impl S3ObjectKey {
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteObjectName for S3ObjectKey {
|
||||
/// Turn a/b/c or a/b/c/ into c
|
||||
fn object_name(&self) -> Option<&str> {
|
||||
// corner case, char::to_string is not const, thats why this is more verbose than it needs to be
|
||||
// see https://github.com/rust-lang/rust/issues/88674
|
||||
if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
|
||||
return None;
|
||||
}
|
||||
|
||||
if self.0.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
|
||||
} else {
|
||||
self.0
|
||||
.rsplit_once(S3_PREFIX_SEPARATOR)
|
||||
.map(|(_, last)| last)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
workdir: PathBuf,
|
||||
@@ -316,11 +312,11 @@ impl RemoteStorage for S3Bucket {
|
||||
/// Note: it won't include empty "directories"
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
prefix: Option<&Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
// get the passed prefix or if it is not set use prefix_in_bucket value
|
||||
let list_prefix = prefix
|
||||
.map(|p| p.0)
|
||||
.map(|p| p.0.clone())
|
||||
.or_else(|| self.prefix_in_bucket.clone())
|
||||
.map(|mut p| {
|
||||
// required to end with a separator
|
||||
|
||||
@@ -39,7 +39,7 @@ bytes = "1.0.1"
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
criterion = "0.3"
|
||||
rustls-pemfile = "0.2.1"
|
||||
rustls-pemfile = "1"
|
||||
|
||||
[[bench]]
|
||||
name = "benchmarks"
|
||||
|
||||
@@ -8,6 +8,9 @@ pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
pub mod seqwait;
|
||||
|
||||
/// A simple Read-Copy-Update implementation.
|
||||
pub mod simple_rcu;
|
||||
|
||||
/// append only ordered map implemented with a Vec
|
||||
pub mod vec_map;
|
||||
|
||||
|
||||
@@ -163,14 +163,9 @@ pub fn is_socket_read_timed_out(error: &anyhow::Error) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations)
|
||||
// PG protocol strings are always C strings.
|
||||
fn cstr_to_str(b: &Bytes) -> Result<&str> {
|
||||
let without_null = if b.last() == Some(&0) {
|
||||
&b[..b.len() - 1]
|
||||
} else {
|
||||
&b[..]
|
||||
};
|
||||
// Cast a byte slice to a string slice, dropping null terminator if there's one.
|
||||
fn cstr_to_str(bytes: &[u8]) -> Result<&str> {
|
||||
let without_null = bytes.strip_suffix(&[0]).unwrap_or(bytes);
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
@@ -423,9 +418,9 @@ impl PostgresBackend {
|
||||
self.state = ProtoState::Established;
|
||||
}
|
||||
|
||||
FeMessage::Query(m) => {
|
||||
FeMessage::Query(body) => {
|
||||
// remove null terminator
|
||||
let query_string = cstr_to_str(&m.body)?;
|
||||
let query_string = cstr_to_str(&body)?;
|
||||
|
||||
trace!("got query {:?}", query_string);
|
||||
// xxx distinguish fatal and recoverable errors?
|
||||
|
||||
@@ -7,11 +7,14 @@ use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use postgres_protocol::PG_EPOCH;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::future::Future;
|
||||
use std::io::{self, Cursor};
|
||||
use std::str;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
future::Future,
|
||||
io::{self, Cursor},
|
||||
str,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
use tokio::io::AsyncReadExt;
|
||||
use tracing::{trace, warn};
|
||||
|
||||
@@ -25,8 +28,10 @@ pub const TEXT_OID: Oid = 25;
|
||||
#[derive(Debug)]
|
||||
pub enum FeMessage {
|
||||
StartupPacket(FeStartupPacket),
|
||||
Query(FeQueryMessage), // Simple query
|
||||
Parse(FeParseMessage), // Extended query protocol
|
||||
// Simple query.
|
||||
Query(Bytes),
|
||||
// Extended query protocol.
|
||||
Parse(FeParseMessage),
|
||||
Describe(FeDescribeMessage),
|
||||
Bind(FeBindMessage),
|
||||
Execute(FeExecuteMessage),
|
||||
@@ -51,7 +56,67 @@ pub enum FeStartupPacket {
|
||||
},
|
||||
}
|
||||
|
||||
pub type StartupMessageParams = HashMap<String, String>;
|
||||
#[derive(Debug)]
|
||||
pub struct StartupMessageParams {
|
||||
params: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl StartupMessageParams {
|
||||
/// Get parameter's value by its name.
|
||||
pub fn get(&self, name: &str) -> Option<&str> {
|
||||
self.params.get(name).map(|s| s.as_str())
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// taking into account all escape sequences but leaving them as-is.
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let mut last_was_escape = false;
|
||||
let iter = self
|
||||
.get("options")?
|
||||
.split(move |c: char| {
|
||||
// We split by non-escaped whitespace symbols.
|
||||
let should_split = c.is_ascii_whitespace() && !last_was_escape;
|
||||
last_was_escape = c == '\\' && !last_was_escape;
|
||||
should_split
|
||||
})
|
||||
.filter(|s| !s.is_empty());
|
||||
|
||||
Some(iter)
|
||||
}
|
||||
|
||||
/// Split command-line options according to PostgreSQL's logic,
|
||||
/// applying all escape sequences (using owned strings as needed).
|
||||
/// [`None`] means that there's no `options` in [`Self`].
|
||||
pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
|
||||
// See `postgres: pg_split_opts`.
|
||||
let iter = self.options_raw()?.map(|s| {
|
||||
let mut preserve_next_escape = false;
|
||||
let escape = |c| {
|
||||
// We should remove '\\' unless it's preceded by '\\'.
|
||||
let should_remove = c == '\\' && !preserve_next_escape;
|
||||
preserve_next_escape = should_remove;
|
||||
should_remove
|
||||
};
|
||||
|
||||
match s.contains('\\') {
|
||||
true => Cow::Owned(s.replace(escape, "")),
|
||||
false => Cow::Borrowed(s),
|
||||
}
|
||||
});
|
||||
|
||||
Some(iter)
|
||||
}
|
||||
|
||||
// This function is mostly useful in tests.
|
||||
#[doc(hidden)]
|
||||
pub fn new<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> Self {
|
||||
Self {
|
||||
params: pairs.map(|(k, v)| (k.to_owned(), v.to_owned())).into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
|
||||
pub struct CancelKeyData {
|
||||
@@ -69,11 +134,6 @@ impl Distribution<CancelKeyData> for Standard {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FeQueryMessage {
|
||||
pub body: Bytes,
|
||||
}
|
||||
|
||||
// We only support the simple case of Parse on unnamed prepared statement and
|
||||
// no params
|
||||
#[derive(Debug)]
|
||||
@@ -89,7 +149,7 @@ pub struct FeDescribeMessage {
|
||||
|
||||
// we only support unnamed prepared stmt and portal
|
||||
#[derive(Debug)]
|
||||
pub struct FeBindMessage {}
|
||||
pub struct FeBindMessage;
|
||||
|
||||
// we only support unnamed prepared stmt or portal
|
||||
#[derive(Debug)]
|
||||
@@ -100,7 +160,7 @@ pub struct FeExecuteMessage {
|
||||
|
||||
// we only support unnamed prepared stmt and portal
|
||||
#[derive(Debug)]
|
||||
pub struct FeCloseMessage {}
|
||||
pub struct FeCloseMessage;
|
||||
|
||||
/// Retry a read on EINTR
|
||||
///
|
||||
@@ -163,22 +223,20 @@ impl FeMessage {
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
};
|
||||
let len = retry_read!(stream.read_u32().await)?;
|
||||
|
||||
// The message length includes itself, so it better be at least 4
|
||||
let bodylen = len
|
||||
// The message length includes itself, so it better be at least 4.
|
||||
let len = retry_read!(stream.read_u32().await)?
|
||||
.checked_sub(4)
|
||||
.context("invalid message length: parsing u32")?;
|
||||
.context("invalid message length")?;
|
||||
|
||||
// Read message body
|
||||
let mut body_buf: Vec<u8> = vec![0; bodylen as usize];
|
||||
stream.read_exact(&mut body_buf).await?;
|
||||
let body = {
|
||||
let mut buffer = vec![0u8; len as usize];
|
||||
stream.read_exact(&mut buffer).await?;
|
||||
Bytes::from(buffer)
|
||||
};
|
||||
|
||||
let body = Bytes::from(body_buf);
|
||||
|
||||
// Parse it
|
||||
match tag {
|
||||
b'Q' => Ok(Some(FeMessage::Query(FeQueryMessage { body }))),
|
||||
b'Q' => Ok(Some(FeMessage::Query(body))),
|
||||
b'P' => Ok(Some(FeParseMessage::parse(body)?)),
|
||||
b'D' => Ok(Some(FeDescribeMessage::parse(body)?)),
|
||||
b'E' => Ok(Some(FeExecuteMessage::parse(body)?)),
|
||||
@@ -242,9 +300,9 @@ impl FeStartupPacket {
|
||||
stream.read_exact(params_bytes.as_mut()).await?;
|
||||
|
||||
// Parse params depending on request code
|
||||
let most_sig_16_bits = request_code >> 16;
|
||||
let least_sig_16_bits = request_code & ((1 << 16) - 1);
|
||||
let message = match (most_sig_16_bits, least_sig_16_bits) {
|
||||
let req_hi = request_code >> 16;
|
||||
let req_lo = request_code & ((1 << 16) - 1);
|
||||
let message = match (req_hi, req_lo) {
|
||||
(RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => {
|
||||
ensure!(params_len == 8, "expected 8 bytes for CancelRequest params");
|
||||
let mut cursor = Cursor::new(params_bytes);
|
||||
@@ -253,173 +311,115 @@ impl FeStartupPacket {
|
||||
cancel_key: cursor.read_i32().await?,
|
||||
})
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => FeStartupPacket::SslRequest,
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => {
|
||||
// Requested upgrade to SSL (aka TLS)
|
||||
FeStartupPacket::SslRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => {
|
||||
// Requested upgrade to GSSAPI
|
||||
FeStartupPacket::GssEncRequest
|
||||
}
|
||||
(RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => {
|
||||
bail!("Unrecognized request code {}", unrecognized_code)
|
||||
}
|
||||
// TODO bail if protocol major_version is not 3?
|
||||
(major_version, minor_version) => {
|
||||
// TODO bail if protocol major_version is not 3?
|
||||
// Parse null-terminated (String) pairs of param name / param value
|
||||
let params_str = str::from_utf8(&params_bytes).unwrap();
|
||||
let mut params_tokens = params_str.split('\0');
|
||||
let mut params: HashMap<String, String> = HashMap::new();
|
||||
while let Some(name) = params_tokens.next() {
|
||||
let value = params_tokens
|
||||
// Parse pairs of null-terminated strings (key, value).
|
||||
// See `postgres: ProcessStartupPacket, build_startup_packet`.
|
||||
let mut tokens = str::from_utf8(&params_bytes)
|
||||
.context("StartupMessage params: invalid utf-8")?
|
||||
.strip_suffix('\0') // drop packet's own null terminator
|
||||
.context("StartupMessage params: missing null terminator")?
|
||||
.split_terminator('\0');
|
||||
|
||||
let mut params = HashMap::new();
|
||||
while let Some(name) = tokens.next() {
|
||||
let value = tokens
|
||||
.next()
|
||||
.context("expected even number of params in StartupMessage")?;
|
||||
if name == "options" {
|
||||
// parsing options arguments "...&options=<var0>%3D<val0>+<var1>=<var1>..."
|
||||
// '%3D' is '=' and '+' is ' '
|
||||
.context("StartupMessage params: key without value")?;
|
||||
|
||||
// Note: we allow users that don't have SNI capabilities,
|
||||
// to pass a special keyword argument 'project'
|
||||
// to be used to determine the cluster name by the proxy.
|
||||
|
||||
//TODO: write unit test for this and refactor in its own function.
|
||||
for cmdopt in value.split(' ') {
|
||||
let nameval: Vec<&str> = cmdopt.split('=').collect();
|
||||
if nameval.len() == 2 {
|
||||
params.insert(nameval[0].to_string(), nameval[1].to_string());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
params.insert(name.to_string(), value.to_string());
|
||||
}
|
||||
params.insert(name.to_owned(), value.to_owned());
|
||||
}
|
||||
|
||||
FeStartupPacket::StartupMessage {
|
||||
major_version,
|
||||
minor_version,
|
||||
params,
|
||||
params: StartupMessageParams { params },
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Some(FeMessage::StartupPacket(message)))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FeParseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
let query_string = read_null_terminated(&mut buf)?;
|
||||
let nparams = buf.get_i16();
|
||||
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
// FIXME: the rust-postgres driver uses a named prepared statement
|
||||
// for copy_out(). We're not prepared to handle that correctly. For
|
||||
// now, just ignore the statement name, assuming that the client never
|
||||
// uses more than one prepared statement at a time.
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Parse",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
if nparams != 0 {
|
||||
bail!("query params not implemented");
|
||||
}
|
||||
let _pstmt_name = read_cstr(&mut buf)?;
|
||||
let query_string = read_cstr(&mut buf)?;
|
||||
let nparams = buf.get_i16();
|
||||
|
||||
ensure!(nparams == 0, "query params not implemented");
|
||||
|
||||
Ok(FeMessage::Parse(FeParseMessage { query_string }))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeDescribeMessage {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let kind = buf.get_u8();
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_name = read_cstr(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented in Describe",
|
||||
));
|
||||
}
|
||||
*/
|
||||
|
||||
if kind != b'S' {
|
||||
bail!("only prepared statmement Describe is implemented");
|
||||
}
|
||||
ensure!(
|
||||
kind == b'S',
|
||||
"only prepared statemement Describe is implemented"
|
||||
);
|
||||
|
||||
Ok(FeMessage::Describe(FeDescribeMessage { kind }))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeExecuteMessage {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let portal_name = read_cstr(&mut buf)?;
|
||||
let maxrows = buf.get_i32();
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
bail!("named portals not implemented");
|
||||
}
|
||||
|
||||
if maxrows != 0 {
|
||||
bail!("row limit in Execute message not supported");
|
||||
}
|
||||
ensure!(portal_name.is_empty(), "named portals not implemented");
|
||||
ensure!(maxrows == 0, "row limit in Execute message not implemented");
|
||||
|
||||
Ok(FeMessage::Execute(FeExecuteMessage { maxrows }))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeBindMessage {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let portal_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_name = read_null_terminated(&mut buf)?;
|
||||
|
||||
if !portal_name.is_empty() {
|
||||
bail!("named portals not implemented");
|
||||
}
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let portal_name = read_cstr(&mut buf)?;
|
||||
let _pstmt_name = read_cstr(&mut buf)?;
|
||||
|
||||
// FIXME: see FeParseMessage::parse
|
||||
/*
|
||||
if !pstmt_name.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"named prepared statements not implemented",
|
||||
));
|
||||
}
|
||||
*/
|
||||
ensure!(portal_name.is_empty(), "named portals not implemented");
|
||||
|
||||
Ok(FeMessage::Bind(FeBindMessage {}))
|
||||
Ok(FeMessage::Bind(FeBindMessage))
|
||||
}
|
||||
}
|
||||
|
||||
impl FeCloseMessage {
|
||||
pub fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
fn parse(mut buf: Bytes) -> anyhow::Result<FeMessage> {
|
||||
let _kind = buf.get_u8();
|
||||
let _pstmt_or_portal_name = read_null_terminated(&mut buf)?;
|
||||
let _pstmt_or_portal_name = read_cstr(&mut buf)?;
|
||||
|
||||
// FIXME: we do nothing with Close
|
||||
|
||||
Ok(FeMessage::Close(FeCloseMessage {}))
|
||||
Ok(FeMessage::Close(FeCloseMessage))
|
||||
}
|
||||
}
|
||||
|
||||
fn read_null_terminated(buf: &mut Bytes) -> anyhow::Result<Bytes> {
|
||||
let mut result = BytesMut::new();
|
||||
|
||||
loop {
|
||||
if !buf.has_remaining() {
|
||||
bail!("no null-terminator in string");
|
||||
}
|
||||
|
||||
let byte = buf.get_u8();
|
||||
|
||||
if byte == 0 {
|
||||
break;
|
||||
}
|
||||
result.put_u8(byte);
|
||||
}
|
||||
Ok(result.freeze())
|
||||
}
|
||||
|
||||
// Backend
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -441,7 +441,7 @@ pub enum BeMessage<'a> {
|
||||
// None means column is NULL
|
||||
DataRow(&'a [Option<&'a [u8]>]),
|
||||
ErrorResponse(&'a str),
|
||||
// single byte - used in response to SSLRequest/GSSENCRequest
|
||||
/// Single byte - used in response to SSLRequest/GSSENCRequest.
|
||||
EncryptionResponse(bool),
|
||||
NoData,
|
||||
ParameterDescription,
|
||||
@@ -554,49 +554,22 @@ pub static SINGLE_COL_ROWDESC: BeMessage = BeMessage::RowDescription(&[RowDescri
|
||||
formatcode: 0,
|
||||
}]);
|
||||
|
||||
// Safe usize -> i32|i16 conversion, from rust-postgres
|
||||
trait FromUsize: Sized {
|
||||
fn from_usize(x: usize) -> Result<Self, io::Error>;
|
||||
}
|
||||
|
||||
macro_rules! from_usize {
|
||||
($t:ty) => {
|
||||
impl FromUsize for $t {
|
||||
#[inline]
|
||||
fn from_usize(x: usize) -> io::Result<$t> {
|
||||
if x > <$t>::max_value() as usize {
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"value too large to transmit",
|
||||
))
|
||||
} else {
|
||||
Ok(x as $t)
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
from_usize!(i32);
|
||||
|
||||
/// Call f() to write body of the message and prepend it with 4-byte len as
|
||||
/// prescribed by the protocol.
|
||||
fn write_body<F>(buf: &mut BytesMut, f: F) -> io::Result<()>
|
||||
where
|
||||
F: FnOnce(&mut BytesMut) -> io::Result<()>,
|
||||
{
|
||||
fn write_body<R>(buf: &mut BytesMut, f: impl FnOnce(&mut BytesMut) -> R) -> R {
|
||||
let base = buf.len();
|
||||
buf.extend_from_slice(&[0; 4]);
|
||||
|
||||
f(buf)?;
|
||||
let res = f(buf);
|
||||
|
||||
let size = i32::from_usize(buf.len() - base)?;
|
||||
let size = i32::try_from(buf.len() - base).expect("message too big to transmit");
|
||||
(&mut buf[base..]).put_slice(&size.to_be_bytes());
|
||||
Ok(())
|
||||
|
||||
res
|
||||
}
|
||||
|
||||
/// Safe write of s into buf as cstring (String in the protocol).
|
||||
pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
|
||||
fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
|
||||
if s.contains(&0) {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
@@ -608,15 +581,11 @@ pub fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Truncate 0 from C string in Bytes and stringify it (returns slice, no allocations)
|
||||
// PG protocol strings are always C strings.
|
||||
fn cstr_to_str(b: &Bytes) -> Result<&str> {
|
||||
let without_null = if b.last() == Some(&0) {
|
||||
&b[..b.len() - 1]
|
||||
} else {
|
||||
&b[..]
|
||||
};
|
||||
std::str::from_utf8(without_null).map_err(|e| e.into())
|
||||
fn read_cstr(buf: &mut Bytes) -> anyhow::Result<Bytes> {
|
||||
let pos = buf.iter().position(|x| *x == 0);
|
||||
let result = buf.split_to(pos.context("missing terminator")?);
|
||||
buf.advance(1); // drop the null terminator
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
impl<'a> BeMessage<'a> {
|
||||
@@ -631,18 +600,14 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u8(b'R');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_i32(0); // Specifies that the authentication was successful.
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap(); // write into BytesMut can't fail
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::AuthenticationCleartextPassword => {
|
||||
buf.put_u8(b'R');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_i32(3); // Specifies that clear text password is required.
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap(); // write into BytesMut can't fail
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::AuthenticationMD5Password(salt) => {
|
||||
@@ -650,9 +615,7 @@ impl<'a> BeMessage<'a> {
|
||||
write_body(buf, |buf| {
|
||||
buf.put_i32(5); // Specifies that an MD5-encrypted password is required.
|
||||
buf.put_slice(&salt[..]);
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap(); // write into BytesMut can't fail
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::AuthenticationSasl(msg) => {
|
||||
@@ -677,8 +640,7 @@ impl<'a> BeMessage<'a> {
|
||||
}
|
||||
}
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap()
|
||||
})?;
|
||||
}
|
||||
|
||||
BeMessage::BackendKeyData(key_data) => {
|
||||
@@ -686,77 +648,64 @@ impl<'a> BeMessage<'a> {
|
||||
write_body(buf, |buf| {
|
||||
buf.put_i32(key_data.backend_pid);
|
||||
buf.put_i32(key_data.cancel_key);
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::BindComplete => {
|
||||
buf.put_u8(b'2');
|
||||
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
|
||||
write_body(buf, |_| {});
|
||||
}
|
||||
|
||||
BeMessage::CloseComplete => {
|
||||
buf.put_u8(b'3');
|
||||
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
|
||||
write_body(buf, |_| {});
|
||||
}
|
||||
|
||||
BeMessage::CommandComplete(cmd) => {
|
||||
buf.put_u8(b'C');
|
||||
write_body(buf, |buf| {
|
||||
write_cstr(cmd, buf)?;
|
||||
Ok::<_, io::Error>(())
|
||||
})?;
|
||||
write_body(buf, |buf| write_cstr(cmd, buf))?;
|
||||
}
|
||||
|
||||
BeMessage::CopyData(data) => {
|
||||
buf.put_u8(b'd');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_slice(data);
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::CopyDone => {
|
||||
buf.put_u8(b'c');
|
||||
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
|
||||
write_body(buf, |_| {});
|
||||
}
|
||||
|
||||
BeMessage::CopyFail => {
|
||||
buf.put_u8(b'f');
|
||||
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
|
||||
write_body(buf, |_| {});
|
||||
}
|
||||
|
||||
BeMessage::CopyInResponse => {
|
||||
buf.put_u8(b'G');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u8(1); /* copy_is_binary */
|
||||
buf.put_i16(0); /* numAttributes */
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
buf.put_u8(1); // copy_is_binary
|
||||
buf.put_i16(0); // numAttributes
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::CopyOutResponse => {
|
||||
buf.put_u8(b'H');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u8(0); /* copy_is_binary */
|
||||
buf.put_i16(0); /* numAttributes */
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
buf.put_u8(0); // copy_is_binary
|
||||
buf.put_i16(0); // numAttributes
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::CopyBothResponse => {
|
||||
buf.put_u8(b'W');
|
||||
write_body(buf, |buf| {
|
||||
// doesn't matter, used only for replication
|
||||
buf.put_u8(0); /* copy_is_binary */
|
||||
buf.put_i16(0); /* numAttributes */
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
buf.put_u8(0); // copy_is_binary
|
||||
buf.put_i16(0); // numAttributes
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::DataRow(vals) => {
|
||||
@@ -771,9 +720,7 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_i32(-1);
|
||||
}
|
||||
}
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
// ErrorResponse is a zero-terminated array of zero-terminated fields.
|
||||
@@ -788,18 +735,17 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u8(b'E');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u8(b'S'); // severity
|
||||
write_cstr(&Bytes::from("ERROR"), buf)?;
|
||||
buf.put_slice(b"ERROR\0");
|
||||
|
||||
buf.put_u8(b'C'); // SQLSTATE error code
|
||||
write_cstr(&Bytes::from("CXX000"), buf)?;
|
||||
buf.put_slice(b"CXX000\0");
|
||||
|
||||
buf.put_u8(b'M'); // the message
|
||||
write_cstr(error_msg.as_bytes(), buf)?;
|
||||
|
||||
buf.put_u8(0); // terminator
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
})?;
|
||||
}
|
||||
|
||||
// NoticeResponse has the same format as ErrorResponse. From doc: "The frontend should display the
|
||||
@@ -812,23 +758,22 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u8(b'N');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u8(b'S'); // severity
|
||||
write_cstr(&Bytes::from("NOTICE"), buf)?;
|
||||
buf.put_slice(b"NOTICE\0");
|
||||
|
||||
buf.put_u8(b'C'); // SQLSTATE error code
|
||||
write_cstr(&Bytes::from("CXX000"), buf)?;
|
||||
buf.put_slice(b"CXX000\0");
|
||||
|
||||
buf.put_u8(b'M'); // the message
|
||||
write_cstr(error_msg.as_bytes(), buf)?;
|
||||
|
||||
buf.put_u8(0); // terminator
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
})?;
|
||||
}
|
||||
|
||||
BeMessage::NoData => {
|
||||
buf.put_u8(b'n');
|
||||
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
|
||||
write_body(buf, |_| {});
|
||||
}
|
||||
|
||||
BeMessage::EncryptionResponse(should_negotiate) => {
|
||||
@@ -853,9 +798,7 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u8(b'S');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_slice(&buffer[..cnt]);
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::ParameterDescription => {
|
||||
@@ -863,23 +806,19 @@ impl<'a> BeMessage<'a> {
|
||||
write_body(buf, |buf| {
|
||||
// we don't support params, so always 0
|
||||
buf.put_i16(0);
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::ParseComplete => {
|
||||
buf.put_u8(b'1');
|
||||
write_body(buf, |_| Ok::<(), io::Error>(())).unwrap();
|
||||
write_body(buf, |_| {});
|
||||
}
|
||||
|
||||
BeMessage::ReadyForQuery => {
|
||||
buf.put_u8(b'Z');
|
||||
write_body(buf, |buf| {
|
||||
buf.put_u8(b'I');
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::RowDescription(rows) => {
|
||||
@@ -907,9 +846,7 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u64(body.wal_end);
|
||||
buf.put_i64(body.timestamp);
|
||||
buf.put_slice(body.data);
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
});
|
||||
}
|
||||
|
||||
BeMessage::KeepAlive(req) => {
|
||||
@@ -918,10 +855,8 @@ impl<'a> BeMessage<'a> {
|
||||
buf.put_u8(b'k');
|
||||
buf.put_u64(req.sent_ptr);
|
||||
buf.put_i64(req.timestamp);
|
||||
buf.put_u8(if req.request_reply { 1u8 } else { 0u8 });
|
||||
Ok::<_, io::Error>(())
|
||||
})
|
||||
.unwrap();
|
||||
buf.put_u8(if req.request_reply { 1 } else { 0 });
|
||||
});
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -968,17 +903,17 @@ impl ReplicationFeedback {
|
||||
// value itself
|
||||
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
|
||||
buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
|
||||
write_cstr(&Bytes::from("current_timeline_size"), buf)?;
|
||||
buf.put_slice(b"current_timeline_size\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.current_timeline_size);
|
||||
|
||||
write_cstr(&Bytes::from("ps_writelsn"), buf)?;
|
||||
buf.put_slice(b"ps_writelsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.ps_writelsn);
|
||||
write_cstr(&Bytes::from("ps_flushlsn"), buf)?;
|
||||
buf.put_slice(b"ps_flushlsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.ps_flushlsn);
|
||||
write_cstr(&Bytes::from("ps_applylsn"), buf)?;
|
||||
buf.put_slice(b"ps_applylsn\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_u64(self.ps_applylsn);
|
||||
|
||||
@@ -988,7 +923,7 @@ impl ReplicationFeedback {
|
||||
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
|
||||
.as_micros() as i64;
|
||||
|
||||
write_cstr(&Bytes::from("ps_replytime"), buf)?;
|
||||
buf.put_slice(b"ps_replytime\0");
|
||||
buf.put_i32(8);
|
||||
buf.put_i64(timestamp);
|
||||
Ok(())
|
||||
@@ -998,33 +933,30 @@ impl ReplicationFeedback {
|
||||
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
|
||||
let mut zf = ReplicationFeedback::empty();
|
||||
let nfields = buf.get_u8();
|
||||
let mut i = 0;
|
||||
while i < nfields {
|
||||
i += 1;
|
||||
let key_cstr = read_null_terminated(&mut buf).unwrap();
|
||||
let key = cstr_to_str(&key_cstr).unwrap();
|
||||
match key {
|
||||
"current_timeline_size" => {
|
||||
for _ in 0..nfields {
|
||||
let key = read_cstr(&mut buf).unwrap();
|
||||
match key.as_ref() {
|
||||
b"current_timeline_size" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.current_timeline_size = buf.get_u64();
|
||||
}
|
||||
"ps_writelsn" => {
|
||||
b"ps_writelsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.ps_writelsn = buf.get_u64();
|
||||
}
|
||||
"ps_flushlsn" => {
|
||||
b"ps_flushlsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.ps_flushlsn = buf.get_u64();
|
||||
}
|
||||
"ps_applylsn" => {
|
||||
b"ps_applylsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.ps_applylsn = buf.get_u64();
|
||||
}
|
||||
"ps_replytime" => {
|
||||
b"ps_replytime" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
let raw_time = buf.get_i64();
|
||||
@@ -1037,8 +969,8 @@ impl ReplicationFeedback {
|
||||
_ => {
|
||||
let len = buf.get_i32();
|
||||
warn!(
|
||||
"ReplicationFeedback parse. unknown key {} of len {}. Skip it.",
|
||||
key, len
|
||||
"ReplicationFeedback parse. unknown key {} of len {len}. Skip it.",
|
||||
String::from_utf8_lossy(key.as_ref())
|
||||
);
|
||||
buf.advance(len as usize);
|
||||
}
|
||||
@@ -1084,7 +1016,7 @@ mod tests {
|
||||
*first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
|
||||
}
|
||||
|
||||
write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
|
||||
data.put_slice(b"new_field_one\0");
|
||||
data.put_i32(8);
|
||||
data.put_u64(42);
|
||||
|
||||
@@ -1093,6 +1025,33 @@ mod tests {
|
||||
assert_eq!(zf, zf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_startup_message_params_options_escaped() {
|
||||
fn split_options(params: &StartupMessageParams) -> Vec<Cow<'_, str>> {
|
||||
params
|
||||
.options_escaped()
|
||||
.expect("options are None")
|
||||
.collect()
|
||||
}
|
||||
|
||||
let make_params = |options| StartupMessageParams::new([("options", options)]);
|
||||
|
||||
let params = StartupMessageParams::new([]);
|
||||
assert!(matches!(params.options_escaped(), None));
|
||||
|
||||
let params = make_params("");
|
||||
assert!(split_options(&params).is_empty());
|
||||
|
||||
let params = make_params("foo");
|
||||
assert_eq!(split_options(&params), ["foo"]);
|
||||
|
||||
let params = make_params(" foo bar ");
|
||||
assert_eq!(split_options(&params), ["foo", "bar"]);
|
||||
|
||||
let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
|
||||
assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
|
||||
}
|
||||
|
||||
// Make sure that `read` is sync/async callable
|
||||
async fn _assert(stream: &mut (impl tokio::io::AsyncRead + Unpin)) {
|
||||
let _ = FeMessage::read(&mut [].as_ref());
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
///
|
||||
/// Async version of 'seqwait.rs'
|
||||
///
|
||||
/// NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
|
||||
///
|
||||
//!
|
||||
//! Async version of 'seqwait.rs'
|
||||
//!
|
||||
//! NOTE: This is currently unused. If you need this, you'll need to uncomment this in lib.rs.
|
||||
//!
|
||||
|
||||
#![warn(missing_docs)]
|
||||
|
||||
|
||||
217
libs/utils/src/simple_rcu.rs
Normal file
217
libs/utils/src/simple_rcu.rs
Normal file
@@ -0,0 +1,217 @@
|
||||
//!
|
||||
//! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
|
||||
//! similar to a lock, but it allows readers to "hold on" to an old value of RCU
|
||||
//! without blocking writers, and allows writing new values without blocking
|
||||
//! readers. When you update the new value, the new value is immediately visible
|
||||
//! to new readers, but the update waits until all existing readers have
|
||||
//! finished, so that no one sees the old value anymore.
|
||||
//!
|
||||
//! This implementation isn't wait-free; it uses an RwLock that is held for a
|
||||
//! short duration when the value is read or updated.
|
||||
//!
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use std::ops::Deref;
|
||||
use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
|
||||
use std::sync::{Arc, Weak};
|
||||
use std::sync::{Mutex, RwLock, RwLockWriteGuard};
|
||||
|
||||
///
|
||||
/// Rcu allows multiple readers to read and hold onto a value without blocking
|
||||
/// (for very long). Storing to the Rcu updates the value, making new readers
|
||||
/// immediately see the new value, but it also waits for all current readers to
|
||||
/// finish.
|
||||
///
|
||||
pub struct Rcu<V> {
|
||||
inner: RwLock<RcuInner<V>>,
|
||||
}
|
||||
|
||||
struct RcuInner<V> {
|
||||
current_cell: Arc<RcuCell<V>>,
|
||||
old_cells: Vec<Weak<RcuCell<V>>>,
|
||||
}
|
||||
|
||||
/// A single stored value, either the latest one or an older one that readers
/// may still be holding.
struct RcuCell<V> {
    value: V,

    /// A dummy channel on which nothing is ever sent. Its only purpose is that
    /// dropping the `RcuCell` drops the receiver, so any cloned sender learns
    /// that the channel is closed. Updaters use this to wait until the cell has
    /// been dropped, i.e. until the old value is no longer in use.
    ///
    /// The receiver is never read from; it only has to be kept alive for as
    /// long as the cell lives. Because `Receiver` is not `Sync`, it is wrapped
    /// in a `Mutex`.
    watch: (SyncSender<()>, Mutex<Receiver<()>>),
}

impl<V> RcuCell<V> {
    /// Wrap `value` in a new cell with a fresh zero-capacity watch channel.
    fn new(value: V) -> Self {
        let (tx, rx) = sync_channel(0);
        Self {
            value,
            watch: (tx, Mutex::new(rx)),
        }
    }
}
|
||||
|
||||
impl<V> Rcu<V> {
|
||||
/// Create a new `Rcu`, initialized to `starting_val`
|
||||
pub fn new(starting_val: V) -> Self {
|
||||
let inner = RcuInner {
|
||||
current_cell: Arc::new(RcuCell::new(starting_val)),
|
||||
old_cells: Vec::new(),
|
||||
};
|
||||
Self {
|
||||
inner: RwLock::new(inner),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Read current value. Any store() calls will block until the returned
|
||||
/// guard object is dropped.
|
||||
///
|
||||
pub fn read(&self) -> RcuReadGuard<V> {
|
||||
let current_cell = Arc::clone(&self.inner.read().unwrap().current_cell);
|
||||
RcuReadGuard { cell: current_cell }
|
||||
}
|
||||
|
||||
///
|
||||
/// Lock the current value for updating. Returns a guard object that can be
|
||||
/// used to read the current value, and to store a new value.
|
||||
///
|
||||
/// Note: holding the write-guard blocks concurrent readers, so you should
|
||||
/// finish the update and drop the guard quickly!
|
||||
///
|
||||
pub fn write(&self) -> RcuWriteGuard<'_, V> {
|
||||
let inner = self.inner.write().unwrap();
|
||||
RcuWriteGuard { inner }
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Read guard returned by `read`
|
||||
///
|
||||
pub struct RcuReadGuard<V> {
|
||||
cell: Arc<RcuCell<V>>,
|
||||
}
|
||||
|
||||
impl<V> Deref for RcuReadGuard<V> {
|
||||
type Target = V;
|
||||
|
||||
fn deref(&self) -> &V {
|
||||
&self.cell.value
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Read guard returned by `read`
|
||||
///
|
||||
pub struct RcuWriteGuard<'a, V> {
|
||||
inner: RwLockWriteGuard<'a, RcuInner<V>>,
|
||||
}
|
||||
|
||||
impl<'a, V> Deref for RcuWriteGuard<'a, V> {
|
||||
type Target = V;
|
||||
|
||||
fn deref(&self) -> &V {
|
||||
&self.inner.current_cell.value
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, V> RcuWriteGuard<'a, V> {
|
||||
///
|
||||
/// Store a new value. The new value will be written to the Rcu immediately,
|
||||
/// and will be immediately seen by any `read` calls that start afterwards.
|
||||
/// But if there are any readers still holding onto the old value, or any
|
||||
/// even older values, this will await until they have been released.
|
||||
///
|
||||
/// This will drop the write-guard before it starts waiting for the reads to
|
||||
/// finish, so a new write operation can begin before this functio returns.
|
||||
///
|
||||
pub fn store(mut self, new_val: V) {
|
||||
let new_cell = Arc::new(RcuCell::new(new_val));
|
||||
|
||||
let mut watches = Vec::new();
|
||||
{
|
||||
let old = std::mem::replace(&mut self.inner.current_cell, new_cell);
|
||||
self.inner.old_cells.push(Arc::downgrade(&old));
|
||||
|
||||
// cleanup old cells that no longer have any readers, and collect
|
||||
// the watches for any that do.
|
||||
self.inner.old_cells.retain(|weak| {
|
||||
if let Some(cell) = weak.upgrade() {
|
||||
watches.push(cell.watch.0.clone());
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
});
|
||||
}
|
||||
drop(self);
|
||||
|
||||
// after all the old_cells are no longer in use, we're done
|
||||
for w in watches.iter_mut() {
|
||||
// This will block until the Receiver is closed. That happens then
|
||||
// the RcuCell is dropped.
|
||||
#[allow(clippy::single_match)]
|
||||
match w.send(()) {
|
||||
Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
|
||||
Err(_) => {
|
||||
// closed, which means that the cell has been dropped, and
|
||||
// its value is no longer in use
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Arc, Mutex};
    use std::thread::{sleep, spawn};
    use std::time::Duration;

    /// A store started while an old read guard is alive must publish the new
    /// value right away, but may only return after that guard is released.
    #[test]
    fn basic() {
        let rcu = Arc::new(Rcu::new(1));
        let log = Arc::new(Mutex::new(Vec::new()));

        let first_guard = rcu.read();
        assert_eq!(*first_guard, 1);
        log.lock().unwrap().push("one");

        let writer = {
            let rcu = Arc::clone(&rcu);
            let log = Arc::clone(&log);
            spawn(move || {
                log.lock().unwrap().push("store two start");
                let write_guard = rcu.write();
                assert_eq!(*write_guard, 1);
                write_guard.store(2);
                log.lock().unwrap().push("store two done");
            })
        };
        // Without this sleep the test could pass by accident if the writer is slow.
        sleep(Duration::from_secs(1));

        // A read started now must observe the new value ...
        let second_guard = rcu.read();
        assert_eq!(*second_guard, 2);

        // ... while the guard taken earlier still sees the old one.
        assert_eq!(*first_guard, 1);

        // Releasing the old guard is what allows the store in the thread to finish.
        log.lock().unwrap().push("release a");
        drop(first_guard);

        writer.join().unwrap();

        let expected = ["one", "store two start", "release a", "store two done"];
        assert_eq!(log.lock().unwrap().as_slice(), &expected);
    }
}
|
||||
@@ -15,7 +15,7 @@ failpoints = ["fail/failpoints"]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
bytes = "1.0.1"
|
||||
byteorder = "1.4.3"
|
||||
futures = "0.3.13"
|
||||
hex = "0.4.3"
|
||||
|
||||
@@ -22,8 +22,8 @@ use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::DatadirTimeline;
|
||||
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName};
|
||||
@@ -36,13 +36,12 @@ use utils::lsn::Lsn;
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a, W, T>
|
||||
pub struct Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
T: DatadirTimeline,
|
||||
{
|
||||
ar: Builder<AbortableWrite<W>>,
|
||||
timeline: &'a Arc<T>,
|
||||
timeline: &'a Arc<Timeline>,
|
||||
pub lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
full_backup: bool,
|
||||
@@ -57,18 +56,17 @@ where
|
||||
// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
// to start the replication.
|
||||
impl<'a, W, T> Basebackup<'a, W, T>
|
||||
impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
T: DatadirTimeline,
|
||||
{
|
||||
pub fn new(
|
||||
write: W,
|
||||
timeline: &'a Arc<T>,
|
||||
timeline: &'a Arc<Timeline>,
|
||||
req_lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
) -> Result<Basebackup<'a, W, T>> {
|
||||
) -> Result<Basebackup<'a, W>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
@@ -404,10 +402,9 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W, T> Drop for Basebackup<'a, W, T>
|
||||
impl<'a, W> Drop for Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
T: DatadirTimeline,
|
||||
{
|
||||
/// If the basebackup was not finished, prevent the Archive::drop() from
|
||||
/// writing the end-of-archive marker.
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use std::{env, ops::ControlFlow, path::Path, str::FromStr, sync::Arc};
|
||||
use tracing::*;
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
@@ -298,7 +299,14 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?
|
||||
.map(Arc::new);
|
||||
let remote_index = tenant_mgr::init_tenant_mgr(conf, remote_storage.as_ref().map(Arc::clone))?;
|
||||
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
@@ -310,7 +318,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"http_endpoint_thread",
|
||||
true,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index)?;
|
||||
let router = http::make_router(conf, auth_cloned, remote_index, remote_storage)?;
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
},
|
||||
)?;
|
||||
|
||||
@@ -129,9 +129,9 @@ pub struct LocalTimelineInfo {
|
||||
pub latest_gc_cutoff_lsn: Lsn,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
pub current_logical_size: Option<usize>, // is None when timeline is Unloaded
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_logical_size_non_incremental: Option<usize>,
|
||||
pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
|
||||
pub current_logical_size_non_incremental: Option<u64>,
|
||||
pub current_physical_size_non_incremental: Option<u64>,
|
||||
pub timeline_state: LocalTimelineState,
|
||||
|
||||
@@ -150,6 +150,9 @@ pub struct RemoteTimelineInfo {
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
///
|
||||
/// This represents the output of the "timeline_detail" API call.
|
||||
///
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct TimelineInfo {
|
||||
|
||||
@@ -11,10 +11,8 @@ use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
|
||||
TimelineCreateRequest,
|
||||
};
|
||||
use crate::layered_repository::{metadata::TimelineMetadata, LayeredTimeline};
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::layered_repository::{metadata::TimelineMetadata, Timeline};
|
||||
use crate::repository::{LocalTimelineState, RepositoryTimeline};
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
@@ -37,7 +35,7 @@ struct State {
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
remote_storage: Option<Arc<GenericRemoteStorage>>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -45,20 +43,12 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
remote_storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
// Note that this remote storage is created separately from the main one in the sync_loop.
|
||||
// It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
auth,
|
||||
@@ -85,7 +75,7 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
|
||||
// Helper functions to construct a LocalTimelineInfo struct for a timeline
|
||||
|
||||
fn local_timeline_info_from_loaded_timeline(
|
||||
timeline: &LayeredTimeline,
|
||||
timeline: &Timeline,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<LocalTimelineInfo> {
|
||||
@@ -160,7 +150,7 @@ fn local_timeline_info_from_unloaded_timeline(metadata: &TimelineMetadata) -> Lo
|
||||
}
|
||||
|
||||
fn local_timeline_info_from_repo_timeline(
|
||||
repo_timeline: &RepositoryTimeline<LayeredTimeline>,
|
||||
repo_timeline: &RepositoryTimeline<Timeline>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
include_non_incremental_physical_size: bool,
|
||||
) -> anyhow::Result<LocalTimelineInfo> {
|
||||
@@ -208,7 +198,6 @@ async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiErr
|
||||
async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let new_timeline_info = tokio::task::spawn_blocking(move || {
|
||||
@@ -246,11 +235,12 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
|
||||
async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let include_non_incremental_logical_size =
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
let include_non_incremental_physical_size =
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let local_timeline_infos = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
list_local_timelines(
|
||||
@@ -301,13 +291,12 @@ fn query_param_present(request: &Request<Body>, param: &str) -> bool {
|
||||
|
||||
async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size =
|
||||
query_param_present(&request, "include-non-incremental-logical-size");
|
||||
let include_non_incremental_physical_size =
|
||||
query_param_present(&request, "include-non-incremental-physical-size");
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let (local_timeline_info, remote_timeline_info) = async {
|
||||
// any error here will render local timeline as None
|
||||
@@ -371,7 +360,7 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
info!("Handling tenant attach {}", tenant_id,);
|
||||
info!("Handling tenant attach {}", tenant_id);
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_tenant_state(tenant_id).is_some() {
|
||||
@@ -451,16 +440,8 @@ async fn gather_tenant_timelines_index_parts(
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
|
||||
let index_parts = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id)
|
||||
.await
|
||||
}
|
||||
// FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones
|
||||
// because it is a different instance. We can move this limit to some global static
|
||||
// or use one instance everywhere.
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id)
|
||||
.await
|
||||
Some(storage) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, storage, tenant_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
@@ -480,9 +461,8 @@ async fn gather_tenant_timelines_index_parts(
|
||||
|
||||
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
@@ -521,7 +501,6 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
@@ -589,7 +568,6 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
@@ -658,7 +636,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantConfigRequest = json_request(&mut request).await?;
|
||||
let tenant_id = request_data.tenant_id;
|
||||
// check for management permission
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||
@@ -721,6 +698,7 @@ pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
remote_storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
@@ -737,7 +715,8 @@ pub fn make_router(
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
|
||||
State::new(conf, auth, remote_index, remote_storage)
|
||||
.context("Failed to initialize router state")?,
|
||||
))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
|
||||
@@ -11,6 +11,7 @@ use bytes::Bytes;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::walingest::WalIngest;
|
||||
@@ -39,9 +40,9 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
|
||||
/// This is currently only used to import a cluster freshly created by initdb.
|
||||
/// The code that deals with the checkpoint would not work right if the
|
||||
/// cluster was not shut down cleanly.
|
||||
pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
tline: &T,
|
||||
tline: &Timeline,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
@@ -99,8 +100,8 @@ pub fn import_timeline_from_postgres_datadir<T: DatadirTimeline>(
|
||||
}
|
||||
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_rel<T: DatadirTimeline, Reader: Read>(
|
||||
modification: &mut DatadirModification<T>,
|
||||
fn import_rel<Reader: Read>(
|
||||
modification: &mut DatadirModification,
|
||||
path: &Path,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
@@ -178,8 +179,8 @@ fn import_rel<T: DatadirTimeline, Reader: Read>(
|
||||
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru<T: DatadirTimeline, Reader: Read>(
|
||||
modification: &mut DatadirModification<T>,
|
||||
fn import_slru<Reader: Read>(
|
||||
modification: &mut DatadirModification,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
mut reader: Reader,
|
||||
@@ -234,12 +235,7 @@ fn import_slru<T: DatadirTimeline, Reader: Read>(
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal<T: DatadirTimeline>(
|
||||
walpath: &Path,
|
||||
tline: &T,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
) -> Result<()> {
|
||||
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
@@ -305,12 +301,12 @@ fn import_wal<T: DatadirTimeline>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
tline: &T,
|
||||
pub fn import_basebackup_from_tar<Reader: Read>(
|
||||
tline: &Timeline,
|
||||
reader: Reader,
|
||||
base_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
info!("importing base at {}", base_lsn);
|
||||
info!("importing base at {base_lsn}");
|
||||
let mut modification = tline.begin_modification(base_lsn);
|
||||
modification.init_empty()?;
|
||||
|
||||
@@ -335,7 +331,11 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
debug!("directory {:?}", file_path);
|
||||
}
|
||||
_ => {
|
||||
panic!("tar::EntryType::?? {}", file_path.display());
|
||||
bail!(
|
||||
"entry {} in backup tar archive is of unexpected type: {:?}",
|
||||
file_path.display(),
|
||||
header.entry_type()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -347,8 +347,8 @@ pub fn import_basebackup_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
tline: &T,
|
||||
pub fn import_wal_from_tar<Reader: Read>(
|
||||
tline: &Timeline,
|
||||
reader: Reader,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
@@ -388,7 +388,11 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
continue;
|
||||
}
|
||||
_ => {
|
||||
panic!("tar::EntryType::?? {}", file_path.display());
|
||||
bail!(
|
||||
"entry {} in WAL tar archive is of unexpected type: {:?}",
|
||||
file_path.display(),
|
||||
header.entry_type()
|
||||
);
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -428,14 +432,12 @@ pub fn import_wal_from_tar<T: DatadirTimeline, Reader: Read>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_file<T: DatadirTimeline, Reader: Read>(
|
||||
modification: &mut DatadirModification<T>,
|
||||
fn import_file<Reader: Read>(
|
||||
modification: &mut DatadirModification,
|
||||
file_path: &Path,
|
||||
reader: Reader,
|
||||
len: usize,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
debug!("looking at {:?}", file_path);
|
||||
|
||||
if file_path.starts_with("global") {
|
||||
let spcnode = pg_constants::GLOBALTABLESPACE_OID;
|
||||
let dbnode = 0;
|
||||
@@ -557,7 +559,10 @@ pub fn import_file<T: DatadirTimeline, Reader: Read>(
|
||||
// this to import arbitrary postgres databases.
|
||||
bail!("Importing pg_tblspc is not implemented");
|
||||
} else {
|
||||
debug!("ignored");
|
||||
debug!(
|
||||
"ignoring unrecognized file \"{}\" in tar archive",
|
||||
file_path.display()
|
||||
);
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use tracing::*;
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::collections::hash_map::Entry;
|
||||
@@ -31,7 +32,8 @@ use crate::config::PageServerConf;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline};
|
||||
use crate::repository::{GcResult, RepositoryTimeline};
|
||||
use crate::tenant_mgr::LocalTimelineUpdate;
|
||||
use crate::thread_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
@@ -61,13 +63,13 @@ mod timeline;
|
||||
use storage_layer::Layer;
|
||||
use timeline::LayeredTimelineEntry;
|
||||
|
||||
pub use timeline::LayeredTimeline;
|
||||
pub use timeline::Timeline;
|
||||
|
||||
// re-export this function so that page_cache.rs can use it.
|
||||
pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
|
||||
// re-export for use in storage_sync.rs
|
||||
pub use crate::layered_repository::timeline::save_metadata;
|
||||
pub use crate::layered_repository::metadata::save_metadata;
|
||||
|
||||
// re-export for use in walreceiver
|
||||
pub use crate::layered_repository::timeline::WalReceiverInfo;
|
||||
@@ -78,7 +80,7 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
///
|
||||
/// Repository consists of multiple timelines. Keep them in a hash table.
|
||||
///
|
||||
pub struct LayeredRepository {
|
||||
pub struct Repository {
|
||||
// Global pageserver config parameters
|
||||
pub conf: &'static PageServerConf,
|
||||
|
||||
@@ -119,17 +121,22 @@ pub struct LayeredRepository {
|
||||
upload_layers: bool,
|
||||
}
|
||||
|
||||
/// Public interface
|
||||
impl Repository for LayeredRepository {
|
||||
type Timeline = LayeredTimeline;
|
||||
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Self::Timeline>> {
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
self.get_timeline_internal(timelineid, &timelines)
|
||||
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
impl Repository {
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
/// This function is idempotent. It doesn't change internal state in any way.
|
||||
pub fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Timeline>> {
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
.get(&timelineid)
|
||||
.cloned()
|
||||
.map(RepositoryTimeline::from)
|
||||
}
|
||||
|
||||
fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<LayeredTimeline>> {
|
||||
/// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
|
||||
pub fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
match self.get_timeline_load_internal(timelineid, &mut timelines)? {
|
||||
Some(local_loaded_timeline) => Ok(local_loaded_timeline),
|
||||
@@ -140,7 +147,9 @@ impl Repository for LayeredRepository {
|
||||
}
|
||||
}
|
||||
|
||||
fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Self::Timeline>)> {
|
||||
/// Lists timelines the repository contains.
|
||||
/// Up to repository's implementation to omit certain timelines that are not considered ready for use.
|
||||
pub fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Timeline>)> {
|
||||
self.timelines
|
||||
.lock()
|
||||
.unwrap()
|
||||
@@ -154,11 +163,13 @@ impl Repository for LayeredRepository {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn create_empty_timeline(
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
pub fn create_empty_timeline(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<LayeredTimeline>> {
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let vacant_timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(_) => bail!("Timeline already exists"),
|
||||
@@ -174,9 +185,9 @@ impl Repository for LayeredRepository {
|
||||
crashsafe_dir::create_dir_all(timeline_path)?;
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
|
||||
timeline::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;
|
||||
save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
let timeline = Timeline::new(
|
||||
self.conf,
|
||||
Arc::clone(&self.tenant_conf),
|
||||
metadata,
|
||||
@@ -192,11 +203,16 @@ impl Repository for LayeredRepository {
|
||||
let timeline = Arc::new(timeline);
|
||||
vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)));
|
||||
|
||||
crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach {
|
||||
id: ZTenantTimelineId::new(self.tenant_id(), timeline_id),
|
||||
timeline: Arc::clone(&timeline),
|
||||
});
|
||||
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(
|
||||
pub fn branch_timeline(
|
||||
&self,
|
||||
src: ZTimelineId,
|
||||
dst: ZTimelineId,
|
||||
@@ -238,7 +254,8 @@ impl Repository for LayeredRepository {
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {latest_gc_cutoff_lsn}"
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn
|
||||
))?;
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
@@ -274,11 +291,11 @@ impl Repository for LayeredRepository {
|
||||
dst_prev,
|
||||
Some(src),
|
||||
start_lsn,
|
||||
*src_timeline.latest_gc_cutoff_lsn.read().unwrap(),
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(),
|
||||
src_timeline.initdb_lsn,
|
||||
);
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?;
|
||||
timeline::save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
|
||||
save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
|
||||
timelines.insert(dst, LayeredTimelineEntry::Unloaded { id: dst, metadata });
|
||||
|
||||
info!("branched timeline {} from {} at {}", dst, src, start_lsn);
|
||||
@@ -286,10 +303,16 @@ impl Repository for LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Public entry point to GC. All the logic is in the private
|
||||
/// gc_iteration_internal function, this public facade just wraps it for
|
||||
/// metrics collection.
|
||||
fn gc_iteration(
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this function is periodically called by gc thread.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
|
||||
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
|
||||
/// to make tests more deterministic.
|
||||
/// TODO Do we still need it or we can call checkpoint explicitly in tests where needed?
|
||||
pub fn gc_iteration(
|
||||
&self,
|
||||
target_timeline_id: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
@@ -307,7 +330,11 @@ impl Repository for LayeredRepository {
|
||||
})
|
||||
}
|
||||
|
||||
fn compaction_iteration(&self) -> Result<()> {
|
||||
/// Perform one compaction iteration.
|
||||
/// This function is periodically called by compactor thread.
|
||||
/// Also it can be explicitly requested per timeline through page server
|
||||
/// api's 'compact' command.
|
||||
pub fn compaction_iteration(&self) -> Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// compactions. We don't want to block everything else while the
|
||||
@@ -335,12 +362,11 @@ impl Repository for LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Flush all in-memory data to disk.
|
||||
///
|
||||
/// Used at shutdown.
|
||||
/// Used at graceful shutdown.
|
||||
///
|
||||
fn checkpoint(&self) -> Result<()> {
|
||||
pub fn checkpoint(&self) -> Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// checkpoints. We don't want to block everything else while the
|
||||
@@ -370,7 +396,8 @@ impl Repository for LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
/// Removes timeline-related in-memory data
|
||||
pub fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
// in order to be retriable detach needs to be idempotent
|
||||
// (or at least to a point that each time the detach is called it can make progress)
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
@@ -407,7 +434,9 @@ impl Repository for LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
|
||||
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
pub fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
|
||||
debug!("attach timeline_id: {}", timeline_id,);
|
||||
match self.timelines.lock().unwrap().entry(timeline_id) {
|
||||
Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
|
||||
@@ -421,13 +450,14 @@ impl Repository for LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_remote_index(&self) -> &RemoteIndex {
|
||||
/// Returns the tenant's remote timeline index. Used in walreceiver to grab the remote consistent LSN.
|
||||
pub fn get_remote_index(&self) -> &RemoteIndex {
|
||||
&self.remote_index
|
||||
}
|
||||
}
|
||||
|
||||
/// Private functions
|
||||
impl LayeredRepository {
|
||||
impl Repository {
|
||||
pub fn get_checkpoint_distance(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
@@ -517,49 +547,37 @@ impl LayeredRepository {
|
||||
|
||||
tenant_conf.update(&new_tenant_conf);
|
||||
|
||||
LayeredRepository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?;
|
||||
Repository::persist_tenant_config(self.conf, self.tenant_id, *tenant_conf)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Implementation of the public `get_timeline` function.
|
||||
// Differences from the public:
|
||||
// * interface in that the caller must already hold the mutex on the 'timelines' hashmap.
|
||||
fn get_timeline_internal(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
timelines: &HashMap<ZTimelineId, LayeredTimelineEntry>,
|
||||
) -> Option<LayeredTimelineEntry> {
|
||||
timelines.get(&timelineid).cloned()
|
||||
}
|
||||
|
||||
// Implementation of the public `get_timeline_load` function.
|
||||
// Differences from the public:
|
||||
// * interface in that the caller must already hold the mutex on the 'timelines' hashmap.
|
||||
fn get_timeline_load_internal(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
timeline_id: ZTimelineId,
|
||||
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
|
||||
) -> anyhow::Result<Option<Arc<LayeredTimeline>>> {
|
||||
match timelines.get(&timelineid) {
|
||||
) -> anyhow::Result<Option<Arc<Timeline>>> {
|
||||
match timelines.get(&timeline_id) {
|
||||
Some(entry) => match entry {
|
||||
LayeredTimelineEntry::Loaded(local_timeline) => {
|
||||
debug!("timeline {} found loaded into memory", &timelineid);
|
||||
debug!("timeline {timeline_id} found loaded into memory");
|
||||
return Ok(Some(Arc::clone(local_timeline)));
|
||||
}
|
||||
LayeredTimelineEntry::Unloaded { .. } => {}
|
||||
},
|
||||
None => {
|
||||
debug!("timeline {} not found", &timelineid);
|
||||
debug!("timeline {timeline_id} not found");
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
debug!(
|
||||
"timeline {} found on a local disk, but not loaded into the memory, loading",
|
||||
&timelineid
|
||||
"timeline {timeline_id} found on a local disk, but not loaded into the memory, loading"
|
||||
);
|
||||
let timeline = self.load_local_timeline(timelineid, timelines)?;
|
||||
let timeline = self.load_local_timeline(timeline_id, timelines)?;
|
||||
let was_loaded = timelines.insert(
|
||||
timelineid,
|
||||
timeline_id,
|
||||
LayeredTimelineEntry::Loaded(Arc::clone(&timeline)),
|
||||
);
|
||||
ensure!(
|
||||
@@ -574,7 +592,7 @@ impl LayeredRepository {
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
timelines: &mut HashMap<ZTimelineId, LayeredTimelineEntry>,
|
||||
) -> anyhow::Result<Arc<LayeredTimeline>> {
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id)
|
||||
.context("failed to load metadata")?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
@@ -591,7 +609,7 @@ impl LayeredRepository {
|
||||
.map(LayeredTimelineEntry::Loaded);
|
||||
let _enter = info_span!("loading local timeline").entered();
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
let timeline = Timeline::new(
|
||||
self.conf,
|
||||
Arc::clone(&self.tenant_conf),
|
||||
metadata,
|
||||
@@ -605,7 +623,14 @@ impl LayeredRepository {
|
||||
.load_layer_map(disk_consistent_lsn)
|
||||
.context("failed to load layermap")?;
|
||||
|
||||
Ok(Arc::new(timeline))
|
||||
let timeline = Arc::new(timeline);
|
||||
|
||||
crate::tenant_mgr::try_send_timeline_update(LocalTimelineUpdate::Attach {
|
||||
id: ZTenantTimelineId::new(self.tenant_id(), timeline_id),
|
||||
timeline: Arc::clone(&timeline),
|
||||
});
|
||||
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
@@ -615,8 +640,8 @@ impl LayeredRepository {
|
||||
tenant_id: ZTenantId,
|
||||
remote_index: RemoteIndex,
|
||||
upload_layers: bool,
|
||||
) -> LayeredRepository {
|
||||
LayeredRepository {
|
||||
) -> Repository {
|
||||
Repository {
|
||||
tenant_id,
|
||||
file_lock: RwLock::new(()),
|
||||
conf,
|
||||
@@ -632,9 +657,9 @@ impl LayeredRepository {
|
||||
/// Locate and load config
|
||||
pub fn load_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<TenantConfOpt> {
|
||||
let target_config_path = TenantConf::path(conf, tenantid);
|
||||
let target_config_path = TenantConf::path(conf, tenant_id);
|
||||
|
||||
info!("load tenantconf from {}", target_config_path.display());
|
||||
|
||||
@@ -669,11 +694,11 @@ impl LayeredRepository {
|
||||
|
||||
pub fn persist_tenant_config(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
tenant_id: ZTenantId,
|
||||
tenant_conf: TenantConfOpt,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving tenantconf").entered();
|
||||
let target_config_path = TenantConf::path(conf, tenantid);
|
||||
let target_config_path = TenantConf::path(conf, tenant_id);
|
||||
info!("save tenantconf to {}", target_config_path.display());
|
||||
|
||||
let mut conf_content = r#"# This file contains a specific per-tenant's config.
|
||||
@@ -810,7 +835,7 @@ impl LayeredRepository {
|
||||
// compaction (both require `layer_removal_cs` lock),
|
||||
// but the GC iteration can run concurrently with branch creation.
|
||||
//
|
||||
// See comments in [`LayeredRepository::branch_timeline`] for more information
|
||||
// See comments in [`Repository::branch_timeline`] for more information
|
||||
// about why branch creation task can run concurrently with timeline's GC iteration.
|
||||
for timeline in gc_timelines {
|
||||
if thread_mgr::is_shutdown_requested() {
|
||||
@@ -886,22 +911,525 @@ pub fn load_metadata(
|
||||
})
|
||||
}
|
||||
|
||||
///
|
||||
/// Tests that are specific to the layered storage format.
|
||||
///
|
||||
/// There are more unit tests in repository.rs that work through the
|
||||
/// Repository interface and are expected to work regardless of the
|
||||
/// file format and directory layout. The test here are more low level.
|
||||
///
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
pub mod repo_harness {
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::{fs, path::PathBuf};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::Repository,
|
||||
repository::Key,
|
||||
walrecord::ZenithWalRecord,
|
||||
walredo::{WalRedoError, WalRedoManager},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use hex_literal::hex;
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
pub const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(64, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
|
||||
|
||||
impl From<TenantConf> for TenantConfOpt {
|
||||
fn from(tenant_conf: TenantConf) -> Self {
|
||||
Self {
|
||||
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
|
||||
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
|
||||
compaction_target_size: Some(tenant_conf.compaction_target_size),
|
||||
compaction_period: Some(tenant_conf.compaction_period),
|
||||
compaction_threshold: Some(tenant_conf.compaction_threshold),
|
||||
gc_horizon: Some(tenant_conf.gc_horizon),
|
||||
gc_period: Some(tenant_conf.gc_period),
|
||||
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
|
||||
pitr_interval: Some(tenant_conf.pitr_interval),
|
||||
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RepoHarness<'a> {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: ZTenantId,
|
||||
|
||||
pub lock_guard: (
|
||||
Option<RwLockReadGuard<'a, ()>>,
|
||||
Option<RwLockWriteGuard<'a, ()>>,
|
||||
),
|
||||
}
|
||||
|
||||
impl<'a> RepoHarness<'a> {
|
||||
pub fn create(test_name: &'static str) -> Result<Self> {
|
||||
Self::create_internal(test_name, false)
|
||||
}
|
||||
pub fn create_exclusive(test_name: &'static str) -> Result<Self> {
|
||||
Self::create_internal(test_name, true)
|
||||
}
|
||||
fn create_internal(test_name: &'static str, exclusive: bool) -> Result<Self> {
|
||||
let lock_guard = if exclusive {
|
||||
(None, Some(LOCK.write().unwrap()))
|
||||
} else {
|
||||
(Some(LOCK.read().unwrap()), None)
|
||||
};
|
||||
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_conf = TenantConf::dummy_conf();
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
lock_guard,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Repository {
|
||||
self.try_load().expect("failed to load test repo")
|
||||
}
|
||||
|
||||
pub fn try_load(&self) -> Result<Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
let repo = Repository::new(
|
||||
self.conf,
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
RemoteIndex::default(),
|
||||
false,
|
||||
);
|
||||
// populate repo with locally available timelines
|
||||
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
|
||||
.expect("should be able to read timelines dir")
|
||||
{
|
||||
let timeline_dir_entry = timeline_dir_entry.unwrap();
|
||||
let timeline_id: ZTimelineId = timeline_dir_entry
|
||||
.path()
|
||||
.file_name()
|
||||
.unwrap()
|
||||
.to_string_lossy()
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
repo.attach_timeline(timeline_id)?;
|
||||
}
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
pub struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
key,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::repo_harness::*;
|
||||
use crate::layered_repository::repo_harness::*;
|
||||
use crate::repository::{Key, Value};
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
|
||||
|
||||
/// Writes two versions of one key and reads them back at, between and on
/// the written LSNs.
#[test]
fn test_basic() -> Result<()> {
    let repo = RepoHarness::create("test_basic")?.load();
    let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;

    // Each version goes through its own writer, released at the end of the scope.
    {
        let writer = tline.writer();
        writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
        writer.finish_write(Lsn(0x10));
    }
    {
        let writer = tline.writer();
        writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
        writer.finish_write(Lsn(0x20));
    }

    // A read between two versions sees the older one.
    let expectations = [
        (Lsn(0x10), "foo at 0x10"),
        (Lsn(0x1f), "foo at 0x10"),
        (Lsn(0x20), "foo at 0x20"),
    ];
    for (lsn, expected) in expectations {
        assert_eq!(tline.get(*TEST_KEY, lsn)?, TEST_IMG(expected));
    }

    Ok(())
}
|
||||
|
||||
/// Creating a timeline under an id that is already taken must fail.
#[test]
fn no_duplicate_timelines() -> Result<()> {
    let repo = RepoHarness::create("no_duplicate_timelines")?.load();
    let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;

    // Second attempt with the same id.
    if let Err(e) = repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
        assert_eq!(e.to_string(), "Timeline already exists");
    } else {
        panic!("duplicate timeline creation should fail");
    }

    Ok(())
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
pub fn test_value(s: &str) -> Value {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
Value::Image(buf.freeze())
|
||||
}
|
||||
|
||||
///
/// Test branch creation: a branch sees the parent's history up to the branch
/// point and diverges from it afterwards.
///
#[test]
fn test_branch() -> Result<()> {
    use std::str::from_utf8;

    let repo = RepoHarness::create("test_branch")?.load();
    let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
    let writer = tline.writer();

    let key_a = Key::from_hex("112222222233333333444444445500000001").unwrap();
    let key_b = Key::from_hex("112222222233333333444444445500000002").unwrap();

    // Insert a value on the timeline
    writer.put(key_a, Lsn(0x20), &test_value("foo at 0x20"))?;
    writer.put(key_b, Lsn(0x20), &test_value("foobar at 0x20"))?;
    writer.finish_write(Lsn(0x20));

    writer.put(key_a, Lsn(0x30), &test_value("foo at 0x30"))?;
    writer.finish_write(Lsn(0x30));
    writer.put(key_a, Lsn(0x40), &test_value("foo at 0x40"))?;
    writer.finish_write(Lsn(0x40));

    //assert_current_logical_size(&tline, Lsn(0x40));

    // Branch the history, modify relation differently on the new timeline
    repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
    let newtline = repo
        .get_timeline_load(NEW_TIMELINE_ID)
        .expect("Should have a local timeline");
    let new_writer = newtline.writer();
    new_writer.put(key_a, Lsn(0x40), &test_value("bar at 0x40"))?;
    new_writer.finish_write(Lsn(0x40));

    // Check page contents on both branches
    let parent_a = tline.get(key_a, Lsn(0x40))?;
    assert_eq!(from_utf8(&parent_a)?, "foo at 0x40");

    let branch_a = newtline.get(key_a, Lsn(0x40))?;
    assert_eq!(from_utf8(&branch_a)?, "bar at 0x40");

    // Key B was only ever written on the parent, before the branch point.
    let branch_b = newtline.get(key_b, Lsn(0x40))?;
    assert_eq!(from_utf8(&branch_b)?, "foobar at 0x20");

    //assert_current_logical_size(&tline, Lsn(0x40));

    Ok(())
}
|
||||
|
||||
fn make_some_layers(tline: &Timeline, start_lsn: Lsn) -> Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
let writer = tline.writer();
|
||||
// Create a relation on the timeline
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
}
|
||||
|
||||
/// Branching below the GC cutoff of the source timeline must be rejected.
#[test]
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
    let harness = RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?;
    let repo = harness.load();
    drop(harness);
    let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
    make_some_layers(tline.as_ref(), Lsn(0x20))?;

    // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
    // FIXME: this doesn't actually remove any layer currently, given how the checkpointing
    // and compaction works. But it does set the 'cutoff' point so that the cross check
    // below should fail.
    repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;

    // try to branch at lsn 0x25, should fail because we already garbage collected the data
    let branch_result = repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25)));
    if let Err(err) = branch_result {
        assert!(err.to_string().contains("invalid branch start lsn"));

        // The underlying cause explains why the LSN is out of scope.
        let cause = err.source().unwrap().to_string();
        assert!(cause.contains("we might've already garbage collected needed data"))
    } else {
        panic!("branching should have failed");
    }

    Ok(())
}
|
||||
|
||||
/// Branching at an LSN that precedes the timeline's initdb LSN must be rejected.
#[test]
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
    let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();

    repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
    // try to branch at lsn 0x25, should fail because initdb lsn is 0x50
    match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
        Ok(_) => panic!("branching should have failed"),
        Err(err) => {
            // Plain boolean conditions: the `&` the assertions used to carry
            // was a needless borrow of the `bool`.
            assert!(err.to_string().contains("invalid branch start lsn"));
            assert!(err
                .source()
                .unwrap()
                .to_string()
                .contains("is earlier than latest GC horizon"));
        }
    }

    Ok(())
}
|
||||
|
||||
/*
|
||||
// FIXME: This currently fails to error out. Calling GC doesn't currently
|
||||
// remove the old value, we'd need to work a little harder
|
||||
#[test]
|
||||
fn test_prohibit_get_for_garbage_collected_data() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
|
||||
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
|
||||
match tline.get(*TEST_KEY, Lsn(0x25)) {
|
||||
Ok(_) => panic!("request for page should have failed"),
|
||||
Err(err) => assert!(err.to_string().contains("not found at")),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
*/
|
||||
|
||||
/// GC on the parent must keep the data that a child branch still reads.
#[test]
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
    let repo =
        RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
    let parent = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
    make_some_layers(&parent, Lsn(0x20))?;

    repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
    let child = repo
        .get_timeline_load(NEW_TIMELINE_ID)
        .expect("Should have a local timeline");

    // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
    repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;

    // The child can still read a version that lives in the parent.
    let read_through_child = child.get(*TEST_KEY, Lsn(0x25));
    assert!(read_through_child.is_ok());

    Ok(())
}
|
||||
/// After branching, GC on the parent must not remove data that is visible
/// on the branch, even once the branch has written its own layers.
#[test]
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
    let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
    let parent = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
    make_some_layers(&parent, Lsn(0x20))?;

    repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
    let child = repo
        .get_timeline_load(NEW_TIMELINE_ID)
        .expect("Should have a local timeline");

    make_some_layers(&child, Lsn(0x60))?;

    // run gc on parent
    repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;

    // Check that the data is still accessible on the branch.
    let expected = TEST_IMG(&format!("foo at {}", Lsn(0x40)));
    let actual = child.get(*TEST_KEY, Lsn(0x50))?;
    assert_eq!(actual, expected);

    Ok(())
}
|
||||
|
||||
/// A timeline found on disk starts out unloaded and becomes loaded after
/// `get_timeline_load`.
#[test]
fn timeline_load() -> Result<()> {
    let harness = RepoHarness::create("timeline_load")?;

    // First repository instance: create the timeline and put data on disk.
    {
        let repo = harness.load();
        let created = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
        make_some_layers(&created, Lsn(0x8000))?;
        created.checkpoint(CheckpointConfig::Forced)?;
    }

    // Second repository instance over the same directory.
    let repo = harness.load();

    let before_load = repo
        .get_timeline(TIMELINE_ID)
        .expect("cannot load timeline");
    assert!(matches!(before_load, RepositoryTimeline::Unloaded { .. }));

    assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());

    let after_load = repo
        .get_timeline(TIMELINE_ID)
        .expect("cannot load timeline");
    assert!(matches!(after_load, RepositoryTimeline::Loaded(_)));

    Ok(())
}
|
||||
|
||||
/// Loading a child timeline must also load its ancestor.
#[test]
fn timeline_load_with_ancestor() -> Result<()> {
    let harness = RepoHarness::create("timeline_load_with_ancestor")?;

    // create two timelines
    {
        let repo = harness.load();
        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;

        make_some_layers(&tline, Lsn(0x20))?;
        tline.checkpoint(CheckpointConfig::Forced)?;

        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;

        let newtline = repo
            .get_timeline_load(NEW_TIMELINE_ID)
            .expect("Should have a local timeline");

        make_some_layers(&newtline, Lsn(0x60))?;
        // NOTE(review): this checkpoints the parent a second time; `newtline`
        // may have been intended here - confirm.
        tline.checkpoint(CheckpointConfig::Forced)?;
    }

    // check that both of them are initially unloaded
    let repo = harness.load();
    {
        let parent_entry = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
        assert!(matches!(parent_entry, RepositoryTimeline::Unloaded { .. }));

        let child_entry = repo
            .get_timeline(NEW_TIMELINE_ID)
            .expect("cannot get timeline");
        assert!(matches!(child_entry, RepositoryTimeline::Unloaded { .. }));
    }

    // load only child timeline
    let _ = repo
        .get_timeline_load(NEW_TIMELINE_ID)
        .expect("cannot load timeline");

    // check that both, child and ancestor are loaded
    let child_entry = repo
        .get_timeline(NEW_TIMELINE_ID)
        .expect("cannot get timeline");
    assert!(matches!(child_entry, RepositoryTimeline::Loaded(_)));

    let parent_entry = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
    assert!(matches!(parent_entry, RepositoryTimeline::Loaded(_)));

    Ok(())
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
@@ -940,22 +1468,13 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Target file size in the unit tests. In production, the target
|
||||
// file size is much larger, maybe 1 GB. But a small size makes it
|
||||
// much faster to exercise all the logic for creating the files,
|
||||
// garbage collection, compaction etc.
|
||||
pub const TEST_FILE_SIZE: u64 = 4 * 1024 * 1024;
|
||||
|
||||
#[test]
|
||||
fn test_images() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_images")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
@@ -963,7 +1482,7 @@ pub mod tests {
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
@@ -971,7 +1490,7 @@ pub mod tests {
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
|
||||
writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
drop(writer);
|
||||
|
||||
@@ -979,18 +1498,18 @@ pub mod tests {
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
|
||||
writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
|
||||
writer.finish_write(Lsn(0x40));
|
||||
drop(writer);
|
||||
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
tline.compact()?;
|
||||
|
||||
assert_eq!(tline.get(TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
|
||||
assert_eq!(tline.get(TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
|
||||
assert_eq!(tline.get(TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
|
||||
assert_eq!(tline.get(TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
|
||||
assert_eq!(tline.get(TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x30))?, TEST_IMG("foo at 0x30"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x40))?, TEST_IMG("foo at 0x40"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -157,7 +157,14 @@ where
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache.read_immutable_buf(self.file_id, blknum) {
|
||||
match cache
|
||||
.read_immutable_buf(self.file_id, blknum)
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
format!("Failed to read immutable buf: {e:#}"),
|
||||
)
|
||||
})? {
|
||||
ReadBufResult::Found(guard) => break Ok(guard),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
|
||||
@@ -12,7 +12,7 @@ use once_cell::sync::Lazy;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::io::{self, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
@@ -51,7 +51,7 @@ impl EphemeralFile {
|
||||
conf: &PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<EphemeralFile, std::io::Error> {
|
||||
) -> Result<EphemeralFile, io::Error> {
|
||||
let mut l = EPHEMERAL_FILES.write().unwrap();
|
||||
let file_id = l.next_file_id;
|
||||
l.next_file_id += 1;
|
||||
@@ -76,7 +76,7 @@ impl EphemeralFile {
|
||||
})
|
||||
}
|
||||
|
||||
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
|
||||
fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> {
|
||||
let mut off = 0;
|
||||
while off < PAGE_SZ {
|
||||
let n = self
|
||||
@@ -96,10 +96,13 @@ impl EphemeralFile {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, Error> {
|
||||
fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
let mut write_guard = match cache.write_ephemeral_buf(self.file_id, blkno) {
|
||||
let mut write_guard = match cache
|
||||
.write_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
|
||||
{
|
||||
WriteBufResult::Found(guard) => guard,
|
||||
WriteBufResult::NotFound(mut guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
@@ -127,7 +130,7 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
|
||||
}
|
||||
|
||||
impl FileExt for EphemeralFile {
|
||||
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
@@ -137,7 +140,10 @@ impl FileExt for EphemeralFile {
|
||||
let mut write_guard;
|
||||
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
|
||||
let buf = match cache
|
||||
.read_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => {
|
||||
read_guard = guard;
|
||||
read_guard.as_ref()
|
||||
@@ -158,7 +164,7 @@ impl FileExt for EphemeralFile {
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
@@ -166,7 +172,10 @@ impl FileExt for EphemeralFile {
|
||||
|
||||
let mut write_guard;
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
|
||||
let buf = match cache
|
||||
.write_ephemeral_buf(self.file_id, blkno)
|
||||
.map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
|
||||
{
|
||||
WriteBufResult::Found(guard) => {
|
||||
write_guard = guard;
|
||||
write_guard.deref_mut()
|
||||
@@ -190,7 +199,7 @@ impl FileExt for EphemeralFile {
|
||||
}
|
||||
|
||||
impl BlobWriter for EphemeralFile {
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
|
||||
let pos = self.size;
|
||||
|
||||
let mut blknum = (self.size / PAGE_SZ as u64) as u32;
|
||||
@@ -268,11 +277,11 @@ impl Drop for EphemeralFile {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(std::io::Error::new(
|
||||
Err(e) => Err(io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
@@ -282,7 +291,7 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
|
||||
)),
|
||||
}
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
Err(io::Error::new(
|
||||
ErrorKind::Other,
|
||||
"could not write back page, not found in ephemeral files hash",
|
||||
))
|
||||
@@ -292,11 +301,14 @@ pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Er
|
||||
impl BlockReader for EphemeralFile {
|
||||
type BlockLease = page_cache::PageReadGuard<'static>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
|
||||
// Look up the right page
|
||||
let cache = page_cache::get();
|
||||
loop {
|
||||
match cache.read_ephemeral_buf(self.file_id, blknum) {
|
||||
match cache
|
||||
.read_ephemeral_buf(self.file_id, blknum)
|
||||
.map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
|
||||
{
|
||||
ReadBufResult::Found(guard) => return Ok(guard),
|
||||
ReadBufResult::NotFound(mut write_guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
@@ -311,6 +323,10 @@ impl BlockReader for EphemeralFile {
|
||||
}
|
||||
}
|
||||
|
||||
fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
|
||||
io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -322,7 +338,7 @@ mod tests {
|
||||
|
||||
fn repo_harness(
|
||||
test_name: &str,
|
||||
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
|
||||
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), io::Error> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
@@ -339,7 +355,7 @@ mod tests {
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
|
||||
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0u8);
|
||||
|
||||
@@ -351,7 +367,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> Result<(), Error> {
|
||||
fn test_ephemeral_files() -> Result<(), io::Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
|
||||
|
||||
let file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
@@ -382,7 +398,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_blobs() -> Result<(), Error> {
|
||||
fn test_ephemeral_blobs() -> Result<(), io::Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_blobs")?;
|
||||
|
||||
let mut file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::path::PathBuf;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
// Note: Timeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
pub key_range: Range<Key>,
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::Repository`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
@@ -6,10 +6,13 @@
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::ensure;
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info_span;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
@@ -17,6 +20,7 @@ use utils::{
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::STORAGE_FORMAT_VERSION;
|
||||
|
||||
/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
|
||||
@@ -30,7 +34,7 @@ pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
/// The fields correspond to the values we hold in memory, in Timeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader,
|
||||
@@ -65,17 +69,6 @@ struct TimelineMetadataBody {
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
@@ -173,11 +166,57 @@ impl TimelineMetadata {
|
||||
}
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
pub fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = metadata_path(conf, timelineid, tenantid);
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
OpenOptions::new().write(true).create_new(first_save),
|
||||
)?;
|
||||
|
||||
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
|
||||
|
||||
if file.write(&metadata_bytes)? != metadata_bytes.len() {
|
||||
bail!("Could not write all the metadata bytes in a single call");
|
||||
}
|
||||
file.sync_all()?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(
|
||||
&path
|
||||
.parent()
|
||||
.expect("Metadata should always have a parent dir"),
|
||||
)?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::repository::repo_harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
use crate::layered_repository::repo_harness::TIMELINE_ID;
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
|
||||
@@ -9,14 +9,12 @@ use once_cell::sync::Lazy;
|
||||
use tracing::*;
|
||||
|
||||
use std::cmp::{max, min, Ordering};
|
||||
use std::collections::{hash_map::Entry, HashMap, HashSet};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::ops::{Deref, Range};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{self, AtomicBool, AtomicIsize, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, RwLockReadGuard, TryLockError};
|
||||
use std::sync::atomic::{self, AtomicBool, AtomicI64, Ordering as AtomicOrdering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard, RwLock, TryLockError};
|
||||
use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use metrics::{
|
||||
@@ -32,7 +30,7 @@ use crate::layered_repository::{
|
||||
image_layer::{ImageLayer, ImageLayerWriter},
|
||||
inmemory_layer::InMemoryLayer,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
|
||||
metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME},
|
||||
par_fsync,
|
||||
storage_layer::{Layer, ValueReconstructResult, ValueReconstructState},
|
||||
};
|
||||
@@ -43,19 +41,18 @@ use crate::pgdatadir_mapping::BlockNumber;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::DatadirTimeline;
|
||||
|
||||
use postgres_ffi::v14::xlog_utils::to_pg_timestamp;
|
||||
use utils::{
|
||||
lsn::{AtomicLsn, Lsn, RecordLsn},
|
||||
seqwait::SeqWait,
|
||||
simple_rcu::{Rcu, RcuReadGuard},
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::repository::{GcResult, RepositoryTimeline, Timeline, TimelineWriter};
|
||||
use crate::repository::{GcResult, RepositoryTimeline};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::thread_mgr;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walreceiver::IS_WAL_RECEIVER;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
@@ -140,6 +137,15 @@ static CURRENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static CURRENT_LOGICAL_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
||||
register_int_gauge_vec!(
|
||||
"pageserver_current_logical_size",
|
||||
"Current logical size grouped by timeline",
|
||||
&["tenant_id", "timeline_id"]
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
// Metrics for cloud upload. These metrics reflect data uploaded to cloud storage,
|
||||
// or in testing they estimate how much we would upload if we did.
|
||||
static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounter> = Lazy::new(|| {
|
||||
@@ -160,7 +166,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounter> = Lazy::new(|| {
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum LayeredTimelineEntry {
|
||||
Loaded(Arc<LayeredTimeline>),
|
||||
Loaded(Arc<Timeline>),
|
||||
Unloaded {
|
||||
id: ZTimelineId,
|
||||
metadata: TimelineMetadata,
|
||||
@@ -191,7 +197,7 @@ impl LayeredTimelineEntry {
|
||||
}
|
||||
}
|
||||
|
||||
fn ensure_loaded(&self) -> anyhow::Result<&Arc<LayeredTimeline>> {
|
||||
fn ensure_loaded(&self) -> anyhow::Result<&Arc<Timeline>> {
|
||||
match self {
|
||||
LayeredTimelineEntry::Loaded(timeline) => Ok(timeline),
|
||||
LayeredTimelineEntry::Unloaded { .. } => {
|
||||
@@ -213,7 +219,7 @@ impl LayeredTimelineEntry {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LayeredTimelineEntry> for RepositoryTimeline<LayeredTimeline> {
|
||||
impl From<LayeredTimelineEntry> for RepositoryTimeline<Timeline> {
|
||||
fn from(entry: LayeredTimelineEntry) -> Self {
|
||||
match entry {
|
||||
LayeredTimelineEntry::Loaded(timeline) => RepositoryTimeline::Loaded(timeline as _),
|
||||
@@ -235,6 +241,8 @@ struct TimelineMetrics {
|
||||
pub last_record_gauge: IntGauge,
|
||||
pub wait_lsn_time_histo: Histogram,
|
||||
pub current_physical_size_gauge: UIntGauge,
|
||||
/// copy of Timeline.current_logical_size
|
||||
pub current_logical_size_gauge: IntGauge,
|
||||
}
|
||||
|
||||
impl TimelineMetrics {
|
||||
@@ -272,6 +280,9 @@ impl TimelineMetrics {
|
||||
let current_physical_size_gauge = CURRENT_PHYSICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
|
||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||
.unwrap();
|
||||
|
||||
TimelineMetrics {
|
||||
reconstruct_time_histo,
|
||||
@@ -284,11 +295,12 @@ impl TimelineMetrics {
|
||||
last_record_gauge,
|
||||
wait_lsn_time_histo,
|
||||
current_physical_size_gauge,
|
||||
current_logical_size_gauge,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayeredTimeline {
|
||||
pub struct Timeline {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: Arc<RwLock<TenantConfOpt>>,
|
||||
|
||||
@@ -340,8 +352,8 @@ pub struct LayeredTimeline {
|
||||
upload_layers: AtomicBool,
|
||||
|
||||
/// Ensures layers aren't frozen by checkpointer between
|
||||
/// [`LayeredTimeline::get_layer_for_write`] and layer reads.
|
||||
/// Locked automatically by [`LayeredTimelineWriter`] and checkpointer.
|
||||
/// [`Timeline::get_layer_for_write`] and layer reads.
|
||||
/// Locked automatically by [`TimelineWriter`] and checkpointer.
|
||||
/// Must always be acquired before the layer map/individual layer lock
|
||||
/// to avoid deadlock.
|
||||
write_lock: Mutex<()>,
|
||||
@@ -351,12 +363,12 @@ pub struct LayeredTimeline {
|
||||
|
||||
/// Layer removal lock.
|
||||
/// A lock to ensure that no layer of the timeline is removed concurrently by other threads.
|
||||
/// This lock is acquired in [`LayeredTimeline::gc`], [`LayeredTimeline::compact`],
|
||||
/// and [`LayeredRepository::delete_timeline`].
|
||||
/// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`],
|
||||
/// and [`Repository::delete_timeline`].
|
||||
layer_removal_cs: Mutex<()>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
pub latest_gc_cutoff_lsn: RwLock<Lsn>,
|
||||
pub latest_gc_cutoff_lsn: Rcu<Lsn>,
|
||||
|
||||
// List of child timelines and their branch points. This is needed to avoid
|
||||
// garbage collecting data that is still needed by the child timelines.
|
||||
@@ -377,7 +389,32 @@ pub struct LayeredTimeline {
|
||||
repartition_threshold: u64,
|
||||
|
||||
/// Current logical size of the "datadir", at the last LSN.
|
||||
current_logical_size: AtomicIsize,
|
||||
///
|
||||
/// Size shouldn't ever be negative, but this is signed for two reasons:
|
||||
///
|
||||
/// 1. If we initialized the "baseline" size lazily, while we already
|
||||
/// process incoming WAL, the incoming WAL records could decrement the
|
||||
/// variable and temporarily make it negative. (This is just future-proofing;
|
||||
/// the initialization is currently not done lazily.)
|
||||
///
|
||||
/// 2. If there is a bug and we e.g. forget to increment it in some cases
|
||||
/// when size grows, but remember to decrement it when it shrinks again, the
|
||||
/// variable could go negative. In that case, it seems better to at least
|
||||
/// try to keep tracking it, rather than clamp or overflow it. Note that
|
||||
/// get_current_logical_size() will clamp the returned value to zero if it's
|
||||
/// negative, and log an error. Could set it permanently to zero or some
|
||||
/// special value to indicate "broken" instead, but this will do for now.
|
||||
///
|
||||
/// Note that we also expose a copy of this value as a prometheus metric,
|
||||
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
|
||||
/// and `set_current_logical_size` functions to modify this, they will
|
||||
/// also keep the prometheus metric in sync.
|
||||
current_logical_size: AtomicI64,
|
||||
// TODO we don't have a good API to ensure on a compilation level
|
||||
// that the timeline passes all initialization.
|
||||
// Hence we ensure that we init at least once for every timeline
|
||||
// and keep this flag to avoid potentially long recomputes.
|
||||
logical_size_initialized: AtomicBool,
|
||||
|
||||
/// Information about the last processed message by the WAL receiver,
|
||||
/// or None if WAL receiver has not received anything for this timeline
|
||||
@@ -385,7 +422,7 @@ pub struct LayeredTimeline {
|
||||
pub last_received_wal: Mutex<Option<WalReceiverInfo>>,
|
||||
|
||||
/// Relation size cache
|
||||
rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
|
||||
}
|
||||
|
||||
pub struct WalReceiverInfo {
|
||||
@@ -394,46 +431,6 @@ pub struct WalReceiverInfo {
|
||||
pub last_received_msg_ts: u128,
|
||||
}
|
||||
|
||||
/// Inherit all the functions from DatadirTimeline, to provide the
|
||||
/// functionality to store PostgreSQL relations, SLRUs, etc. in a
|
||||
/// LayeredTimeline.
|
||||
impl DatadirTimeline for LayeredTimeline {
|
||||
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
|
||||
let rel_size_cache = self.rel_size_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
return Some(*nblocks);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
match rel_size_cache.entry(tag) {
|
||||
Entry::Occupied(mut entry) => {
|
||||
let cached_lsn = entry.get_mut();
|
||||
if lsn >= cached_lsn.0 {
|
||||
*cached_lsn = (lsn, nblocks);
|
||||
}
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert((lsn, nblocks));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.insert(tag, (lsn, nblocks));
|
||||
}
|
||||
|
||||
fn remove_cached_rel_size(&self, tag: &RelTag) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.remove(tag);
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Information about how much history needs to be retained, needed by
|
||||
/// Garbage Collection.
|
||||
@@ -464,45 +461,37 @@ pub struct GcInfo {
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
impl Timeline for LayeredTimeline {
|
||||
fn get_ancestor_lsn(&self) -> Lsn {
|
||||
impl Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
pub fn get_ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
|
||||
/// Get the ancestor's timeline id
|
||||
pub fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline
|
||||
.as_ref()
|
||||
.map(LayeredTimelineEntry::timeline_id)
|
||||
}
|
||||
|
||||
/// Wait until WAL has been received up to the given LSN.
|
||||
fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
// This should never be called from the WAL receiver thread, because that could lead
|
||||
// to a deadlock.
|
||||
ensure!(
|
||||
!IS_WAL_RECEIVER.with(|c| c.get()),
|
||||
"wait_lsn called by WAL receiver thread"
|
||||
);
|
||||
|
||||
self.metrics.wait_lsn_time_histo.observe_closure_duration(
|
||||
|| self.last_record_lsn
|
||||
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
|
||||
lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
|
||||
)
|
||||
}))?;
|
||||
|
||||
Ok(())
|
||||
/// Lock and get timeline's GC cutoff
|
||||
pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
|
||||
self.latest_gc_cutoff_lsn.read()
|
||||
}
|
||||
|
||||
fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn> {
|
||||
self.latest_gc_cutoff_lsn.read().unwrap()
|
||||
}
|
||||
|
||||
/// Look up the value with the given a key
|
||||
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
|
||||
/// Look up given page version.
|
||||
///
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
|
||||
/// above this needs to store suitable metadata to track what data exists with
|
||||
/// what keys, in separate metadata entries. If a non-existent key is requested,
|
||||
/// the Repository implementation may incorrectly return a value from an ancestor
|
||||
/// branch, for example, or waste a lot of cycles chasing the non-existing key.
|
||||
///
|
||||
pub fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes> {
|
||||
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
|
||||
// The cached image can be returned directly if there is no WAL between the cached image
|
||||
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
|
||||
@@ -531,68 +520,31 @@ impl Timeline for LayeredTimeline {
|
||||
.observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
|
||||
}
|
||||
|
||||
/// Public entry point for checkpoint(). All the logic is in the private
|
||||
/// checkpoint_internal function, this public facade just wraps it for
|
||||
/// metrics collection.
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
|
||||
match cconf {
|
||||
CheckpointConfig::Flush => {
|
||||
self.freeze_inmem_layer(false);
|
||||
self.flush_frozen_layers(true)
|
||||
}
|
||||
CheckpointConfig::Forced => {
|
||||
self.freeze_inmem_layer(false);
|
||||
self.flush_frozen_layers(true)?;
|
||||
self.compact()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn.
|
||||
///
|
||||
fn check_lsn_is_in_scope(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
|
||||
) -> Result<()> {
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
|
||||
lsn,
|
||||
**latest_gc_cutoff_lsn,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_last_record_lsn(&self) -> Lsn {
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
pub fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
}
|
||||
|
||||
fn get_prev_record_lsn(&self) -> Lsn {
|
||||
pub fn get_prev_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().prev
|
||||
}
|
||||
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn {
|
||||
/// Atomically get both last and prev.
|
||||
pub fn get_last_record_rlsn(&self) -> RecordLsn {
|
||||
self.last_record_lsn.load()
|
||||
}
|
||||
|
||||
fn get_disk_consistent_lsn(&self) -> Lsn {
|
||||
pub fn get_disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn.load()
|
||||
}
|
||||
|
||||
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a> {
|
||||
Box::new(LayeredTimelineWriter {
|
||||
tl: self,
|
||||
_write_guard: self.write_lock.lock().unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
fn get_physical_size(&self) -> u64 {
|
||||
/// Get the physical size of the timeline at the latest LSN
|
||||
pub fn get_physical_size(&self) -> u64 {
|
||||
self.metrics.current_physical_size_gauge.get()
|
||||
}
|
||||
|
||||
fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
|
||||
/// Get the physical size of the timeline at the latest LSN non incrementally
|
||||
pub fn get_physical_size_non_incremental(&self) -> anyhow::Result<u64> {
|
||||
let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
|
||||
// total size of layer files in the current timeline directory
|
||||
let mut total_physical_size = 0;
|
||||
@@ -611,9 +563,88 @@ impl Timeline for LayeredTimeline {
|
||||
|
||||
Ok(total_physical_size)
|
||||
}
|
||||
|
||||
///
|
||||
/// Wait until WAL has been received and processed up to this LSN.
|
||||
///
|
||||
/// You should call this before any of the other get_* or list_* functions. Calling
|
||||
/// those functions with an LSN that has not been processed yet is an error.
|
||||
///
|
||||
pub fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> {
|
||||
// This should never be called from the WAL receiver thread, because that could lead
|
||||
// to a deadlock.
|
||||
ensure!(
|
||||
!IS_WAL_RECEIVER.with(|c| c.get()),
|
||||
"wait_lsn called by WAL receiver thread"
|
||||
);
|
||||
|
||||
self.metrics.wait_lsn_time_histo.observe_closure_duration(
|
||||
|| self.last_record_lsn
|
||||
.wait_for_timeout(lsn, self.conf.wait_lsn_timeout)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}",
|
||||
lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn()
|
||||
)
|
||||
}))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Check that it is valid to request operations with that lsn.
|
||||
pub fn check_lsn_is_in_scope(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
) -> Result<()> {
|
||||
ensure!(
|
||||
lsn >= **latest_gc_cutoff_lsn,
|
||||
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
|
||||
lsn,
|
||||
**latest_gc_cutoff_lsn,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
pub fn checkpoint(&self, cconf: CheckpointConfig) -> anyhow::Result<()> {
|
||||
match cconf {
|
||||
CheckpointConfig::Flush => {
|
||||
self.freeze_inmem_layer(false);
|
||||
self.flush_frozen_layers(true)
|
||||
}
|
||||
CheckpointConfig::Forced => {
|
||||
self.freeze_inmem_layer(false);
|
||||
self.flush_frozen_layers(true)?;
|
||||
self.compact()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
///
|
||||
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
|
||||
/// is a generic type in this trait. But that doesn't currently work in
|
||||
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
|
||||
pub fn writer(&self) -> TimelineWriter<'_> {
|
||||
TimelineWriter {
|
||||
tl: self,
|
||||
_write_guard: self.write_lock.lock().unwrap(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl LayeredTimeline {
|
||||
// Private functions
|
||||
impl Timeline {
|
||||
fn get_checkpoint_distance(&self) -> u64 {
|
||||
let tenant_conf = self.tenant_conf.read().unwrap();
|
||||
tenant_conf
|
||||
@@ -662,8 +693,8 @@ impl LayeredTimeline {
|
||||
tenant_id: ZTenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
upload_layers: bool,
|
||||
) -> LayeredTimeline {
|
||||
let mut result = LayeredTimeline {
|
||||
) -> Timeline {
|
||||
let mut result = Timeline {
|
||||
conf,
|
||||
tenant_conf,
|
||||
timeline_id,
|
||||
@@ -699,10 +730,11 @@ impl LayeredTimeline {
|
||||
pitr_cutoff: Lsn(0),
|
||||
}),
|
||||
|
||||
latest_gc_cutoff_lsn: RwLock::new(metadata.latest_gc_cutoff_lsn()),
|
||||
latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
|
||||
initdb_lsn: metadata.initdb_lsn(),
|
||||
|
||||
current_logical_size: AtomicIsize::new(0),
|
||||
current_logical_size: AtomicI64::new(0),
|
||||
logical_size_initialized: AtomicBool::new(false),
|
||||
partitioning: Mutex::new((KeyPartitioning::new(), Lsn(0))),
|
||||
repartition_threshold: 0,
|
||||
|
||||
@@ -807,6 +839,10 @@ impl LayeredTimeline {
|
||||
///
|
||||
/// This can be a slow operation.
|
||||
pub fn init_logical_size(&self) -> Result<()> {
|
||||
if self.logical_size_initialized.load(AtomicOrdering::Acquire) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Try a fast-path first:
|
||||
// Copy logical size from ancestor timeline if there has been no changes on this
|
||||
// branch, and no changes on the ancestor branch since the branch point.
|
||||
@@ -819,8 +855,7 @@ impl LayeredTimeline {
|
||||
//
|
||||
// Logical size 0 means that it was not initialized, so don't believe that.
|
||||
if ancestor_logical_size != 0 && ancestor.get_last_record_lsn() == self.ancestor_lsn {
|
||||
self.current_logical_size
|
||||
.store(ancestor_logical_size as isize, AtomicOrdering::SeqCst);
|
||||
self.set_current_logical_size(ancestor_logical_size);
|
||||
debug!(
|
||||
"logical size copied from ancestor: {}",
|
||||
ancestor_logical_size
|
||||
@@ -834,8 +869,7 @@ impl LayeredTimeline {
|
||||
// Have to calculate it the hard way
|
||||
let last_lsn = self.get_last_record_lsn();
|
||||
let logical_size = self.get_current_logical_size_non_incremental(last_lsn)?;
|
||||
self.current_logical_size
|
||||
.store(logical_size as isize, AtomicOrdering::SeqCst);
|
||||
self.set_current_logical_size(logical_size);
|
||||
debug!("calculated logical size the hard way: {}", logical_size);
|
||||
|
||||
timer.stop_and_record();
|
||||
@@ -844,10 +878,10 @@ impl LayeredTimeline {
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
pub fn get_current_logical_size(&self) -> usize {
|
||||
/// NOTE: counted incrementally, includes ancestors.
|
||||
pub fn get_current_logical_size(&self) -> u64 {
|
||||
let current_logical_size = self.current_logical_size.load(AtomicOrdering::Acquire);
|
||||
match usize::try_from(current_logical_size) {
|
||||
match u64::try_from(current_logical_size) {
|
||||
Ok(sz) => sz,
|
||||
Err(_) => {
|
||||
error!(
|
||||
@@ -859,6 +893,36 @@ impl LayeredTimeline {
|
||||
}
|
||||
}
|
||||
|
||||
/// Update current logical size, adding `delta' to the old value.
|
||||
fn update_current_logical_size(&self, delta: i64) {
|
||||
let new_size = self
|
||||
.current_logical_size
|
||||
.fetch_add(delta, AtomicOrdering::SeqCst);
|
||||
|
||||
// Also set the value in the prometheus gauge. Note that
|
||||
// there is a race condition here: if this is is called by two
|
||||
// threads concurrently, the prometheus gauge might be set to
|
||||
// one value while current_logical_size is set to the
|
||||
// other. Currently, only initialization and the WAL receiver
|
||||
// updates the logical size, and they don't run concurrently,
|
||||
// so it cannot happen. And even if it did, it wouldn't be
|
||||
// very serious, the metrics would just be slightly off until
|
||||
// the next update.
|
||||
self.metrics.current_logical_size_gauge.set(new_size);
|
||||
}
|
||||
|
||||
/// Set current logical size.
|
||||
fn set_current_logical_size(&self, new_size: u64) {
|
||||
self.current_logical_size
|
||||
.store(new_size as i64, AtomicOrdering::SeqCst);
|
||||
self.logical_size_initialized
|
||||
.store(true, AtomicOrdering::SeqCst);
|
||||
|
||||
// Also set the value in the prometheus gauge. Same race condition
|
||||
// here as in `update_current_logical_size`.
|
||||
self.metrics.current_logical_size_gauge.set(new_size as i64);
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to a Layer for reading.
|
||||
///
|
||||
@@ -1014,7 +1078,7 @@ impl LayeredTimeline {
|
||||
Some((lsn, img))
|
||||
}
|
||||
|
||||
fn get_ancestor_timeline(&self) -> Result<Arc<LayeredTimeline>> {
|
||||
fn get_ancestor_timeline(&self) -> Result<Arc<Timeline>> {
|
||||
let ancestor = self
|
||||
.ancestor_timeline
|
||||
.as_ref()
|
||||
@@ -1135,7 +1199,7 @@ impl LayeredTimeline {
|
||||
/// Also flush after a period of time without new data -- it helps
|
||||
/// safekeepers to regard pageserver as caught up and suspend activity.
|
||||
///
|
||||
pub fn check_checkpoint_distance(self: &Arc<LayeredTimeline>) -> Result<()> {
|
||||
pub fn check_checkpoint_distance(self: &Arc<Timeline>) -> Result<()> {
|
||||
let last_lsn = self.get_last_record_lsn();
|
||||
let layers = self.layers.read().unwrap();
|
||||
if let Some(open_layer) = &layers.open_layer {
|
||||
@@ -1314,7 +1378,7 @@ impl LayeredTimeline {
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timelineid,
|
||||
self.ancestor_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read().unwrap(),
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.initdb_lsn,
|
||||
);
|
||||
|
||||
@@ -1969,9 +2033,21 @@ impl LayeredTimeline {
|
||||
|
||||
let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered();
|
||||
|
||||
// We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
|
||||
// See branch_timeline() for details.
|
||||
*self.latest_gc_cutoff_lsn.write().unwrap() = new_gc_cutoff;
|
||||
// We need to ensure that no one tries to read page versions or create
|
||||
// branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
|
||||
// for details. This will block until the old value is no longer in use.
|
||||
//
|
||||
// The GC cutoff should only ever move forwards.
|
||||
{
|
||||
let write_guard = self.latest_gc_cutoff_lsn.write();
|
||||
ensure!(
|
||||
*write_guard <= new_gc_cutoff,
|
||||
"Cannot move GC cutoff LSN backwards (was {}, new {})",
|
||||
*write_guard,
|
||||
new_gc_cutoff
|
||||
);
|
||||
write_guard.store(new_gc_cutoff);
|
||||
}
|
||||
|
||||
info!("GC starting");
|
||||
|
||||
@@ -2117,7 +2193,7 @@ impl LayeredTimeline {
|
||||
key: Key,
|
||||
request_lsn: Lsn,
|
||||
mut data: ValueReconstructState,
|
||||
) -> Result<Bytes> {
|
||||
) -> anyhow::Result<Bytes> {
|
||||
// Perform WAL redo if needed
|
||||
data.records.reverse();
|
||||
|
||||
@@ -2167,13 +2243,15 @@ impl LayeredTimeline {
|
||||
|
||||
if img.len() == page_cache::PAGE_SZ {
|
||||
let cache = page_cache::get();
|
||||
cache.memorize_materialized_page(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
key,
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
);
|
||||
cache
|
||||
.memorize_materialized_page(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
key,
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
)
|
||||
.context("Materialized page memoization failed")?;
|
||||
}
|
||||
|
||||
Ok(img)
|
||||
@@ -2208,39 +2286,50 @@ fn layer_traversal_error(
|
||||
Err(msg_iter.fold(err, |err, msg| err.context(msg)))
|
||||
}
|
||||
|
||||
struct LayeredTimelineWriter<'a> {
|
||||
tl: &'a LayeredTimeline,
|
||||
/// Various functions to mutate the timeline.
|
||||
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
|
||||
// This is probably considered a bad practice in Rust and should be fixed eventually,
|
||||
// but will cause large code changes.
|
||||
pub struct TimelineWriter<'a> {
|
||||
tl: &'a Timeline,
|
||||
_write_guard: MutexGuard<'a, ()>,
|
||||
}
|
||||
|
||||
impl Deref for LayeredTimelineWriter<'_> {
|
||||
type Target = dyn Timeline;
|
||||
impl Deref for TimelineWriter<'_> {
|
||||
type Target = Timeline;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.tl
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
|
||||
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
|
||||
impl<'a> TimelineWriter<'a> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
pub fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
|
||||
self.tl.put_value(key, lsn, value)
|
||||
}
|
||||
|
||||
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
|
||||
pub fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()> {
|
||||
self.tl.put_tombstone(key_range, lsn)
|
||||
}
|
||||
|
||||
///
|
||||
/// Track the end of the latest digested WAL record.
|
||||
/// Remember the (end of) last valid WAL record remembered in the timeline.
|
||||
///
|
||||
fn finish_write(&self, new_lsn: Lsn) {
|
||||
/// Call this after you have finished writing all the WAL up to 'lsn'.
|
||||
///
|
||||
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
|
||||
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
|
||||
/// the latest and can be read.
|
||||
pub fn finish_write(&self, new_lsn: Lsn) {
|
||||
self.tl.finish_write(new_lsn);
|
||||
}
|
||||
|
||||
fn update_current_logical_size(&self, delta: isize) {
|
||||
self.tl
|
||||
.current_logical_size
|
||||
.fetch_add(delta, AtomicOrdering::SeqCst);
|
||||
pub fn update_current_logical_size(&self, delta: i64) {
|
||||
self.tl.update_current_logical_size(delta)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2263,39 +2352,3 @@ fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
|
||||
bail!("couldn't find an unused backup number for {:?}", path)
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
pub fn save_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
data: &TimelineMetadata,
|
||||
first_save: bool,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = metadata_path(conf, timelineid, tenantid);
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
OpenOptions::new().write(true).create_new(first_save),
|
||||
)?;
|
||||
|
||||
let metadata_bytes = data.to_bytes().context("Failed to get metadata bytes")?;
|
||||
|
||||
if file.write(&metadata_bytes)? != metadata_bytes.len() {
|
||||
bail!("Could not write all the metadata bytes in a single call");
|
||||
}
|
||||
file.sync_all()?;
|
||||
|
||||
// fsync the parent directory to ensure the directory entry is durable
|
||||
if first_save {
|
||||
let timeline_dir = File::open(
|
||||
&path
|
||||
.parent()
|
||||
.expect("Metadata should always have a parent dir"),
|
||||
)?;
|
||||
timeline_dir.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -28,8 +28,6 @@ use tracing::info;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
|
||||
use pgdatadir_mapping::DatadirTimeline;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
/// This is embedded in the metadata file, and also in the header of all the
|
||||
|
||||
@@ -45,6 +45,7 @@ use std::{
|
||||
},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use utils::{
|
||||
@@ -342,7 +343,7 @@ impl PageCache {
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
) {
|
||||
) -> anyhow::Result<()> {
|
||||
let cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
@@ -352,7 +353,7 @@ impl PageCache {
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key) {
|
||||
match self.lock_for_write(&cache_key)? {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
@@ -364,17 +365,19 @@ impl PageCache {
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with Ephemeral pages.
|
||||
|
||||
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
|
||||
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
|
||||
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
|
||||
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_write(&cache_key)
|
||||
@@ -402,7 +405,7 @@ impl PageCache {
|
||||
|
||||
// Section 1.3: Public interface functions for working with immutable file pages.
|
||||
|
||||
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
|
||||
pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
|
||||
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
@@ -495,15 +498,16 @@ impl PageCache {
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> anyhow::Result<ReadBufResult> {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
|
||||
return ReadBufResult::Found(read_guard);
|
||||
return Ok(ReadBufResult::Found(read_guard));
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
let (slot_idx, mut inner) =
|
||||
self.find_victim().context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
@@ -526,10 +530,10 @@ impl PageCache {
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return ReadBufResult::NotFound(PageWriteGuard {
|
||||
return Ok(ReadBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -556,15 +560,16 @@ impl PageCache {
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
|
||||
fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
|
||||
return WriteBufResult::Found(write_guard);
|
||||
return Ok(WriteBufResult::Found(write_guard));
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
let (slot_idx, mut inner) =
|
||||
self.find_victim().context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
@@ -587,10 +592,10 @@ impl PageCache {
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return WriteBufResult::NotFound(PageWriteGuard {
|
||||
return Ok(WriteBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -754,7 +759,7 @@ impl PageCache {
|
||||
/// Find a slot to evict.
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
|
||||
fn find_victim(&self) -> anyhow::Result<(usize, RwLockWriteGuard<SlotInner>)> {
|
||||
let iter_limit = self.slots.len() * 10;
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
@@ -767,7 +772,7 @@ impl PageCache {
|
||||
let mut inner = match slot.inner.try_write() {
|
||||
Ok(inner) => inner,
|
||||
Err(TryLockError::Poisoned(err)) => {
|
||||
panic!("buffer lock was poisoned: {:?}", err)
|
||||
anyhow::bail!("buffer lock was poisoned: {err:?}")
|
||||
}
|
||||
Err(TryLockError::WouldBlock) => {
|
||||
// If we have looped through the whole buffer pool 10 times
|
||||
@@ -777,7 +782,7 @@ impl PageCache {
|
||||
// there are buffers in the pool. In practice, with a reasonably
|
||||
// large buffer pool it really shouldn't happen.
|
||||
if iters > iter_limit {
|
||||
panic!("could not find a victim buffer to evict");
|
||||
anyhow::bail!("exceeded evict iter limit");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@@ -804,7 +809,7 @@ impl PageCache {
|
||||
inner.dirty = false;
|
||||
inner.key = None;
|
||||
}
|
||||
return (slot_idx, inner);
|
||||
return Ok((slot_idx, inner));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,24 +17,24 @@ use std::io::{self, Read};
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
auth::{self, Claims, JwtAuth, Scope},
|
||||
lsn::Lsn,
|
||||
postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC},
|
||||
simple_rcu::RcuReadGuard,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
|
||||
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -495,22 +495,22 @@ impl PageServerHandler {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists", &tenant_id, &timeline_id])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(timeline.as_ref(), &req)
|
||||
self.handle_get_rel_exists_request(&timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size", &tenant_id, &timeline_id])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_nblocks_request(timeline.as_ref(), &req)
|
||||
self.handle_get_nblocks_request(&timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn", &tenant_id, &timeline_id])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
|
||||
self.handle_get_page_at_lsn_request(&timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_db_size", &tenant_id, &timeline_id])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_db_size_request(timeline.as_ref(), &req)
|
||||
self.handle_db_size_request(&timeline, &req)
|
||||
}),
|
||||
};
|
||||
|
||||
@@ -636,11 +636,11 @@ impl PageServerHandler {
|
||||
/// In either case, if the page server hasn't received the WAL up to the
|
||||
/// requested LSN yet, we will wait for it to arrive. The return value is
|
||||
/// the LSN that should be used to look up the page versions.
|
||||
fn wait_or_get_last_lsn<T: DatadirTimeline>(
|
||||
timeline: &T,
|
||||
fn wait_or_get_last_lsn(
|
||||
timeline: &Timeline,
|
||||
mut lsn: Lsn,
|
||||
latest: bool,
|
||||
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
|
||||
latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
|
||||
) -> Result<Lsn> {
|
||||
if latest {
|
||||
// Latest page version was requested. If LSN is given, it is a hint
|
||||
@@ -684,9 +684,9 @@ impl PageServerHandler {
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
fn handle_get_rel_exists_request<T: DatadirTimeline>(
|
||||
fn handle_get_rel_exists_request(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
@@ -701,9 +701,9 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_nblocks_request<T: DatadirTimeline>(
|
||||
fn handle_get_nblocks_request(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
@@ -717,9 +717,9 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_db_size_request<T: DatadirTimeline>(
|
||||
fn handle_db_size_request(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
|
||||
@@ -735,9 +735,9 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_page_at_lsn_request<T: DatadirTimeline>(
|
||||
fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &T,
|
||||
timeline: &Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
|
||||
@@ -745,7 +745,7 @@ impl PageServerHandler {
|
||||
let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
/*
|
||||
// Add a 1s delay to some requests. The delayed causes the requests to
|
||||
// Add a 1s delay to some requests. The delay helps the requests to
|
||||
// hit the race condition from github issue #1047 more easily.
|
||||
use rand::Rng;
|
||||
if rand::thread_rng().gen::<u8>() < 5 {
|
||||
@@ -1077,7 +1077,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("do_gc ") {
|
||||
// Run GC immediately on given timeline.
|
||||
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
|
||||
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
|
||||
// This probably should require special authentication or a global flag to
|
||||
// enable, I don't think we want to or need to allow regular clients to invoke
|
||||
// GC.
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Timeline;
|
||||
use crate::repository::*;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
@@ -18,7 +18,7 @@ use postgres_ffi::v14::xlog_utils::TimestampTz;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::{hash_map, HashMap, HashSet};
|
||||
use std::ops::Range;
|
||||
use tracing::{debug, trace, warn};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
@@ -35,23 +35,13 @@ pub enum LsnForTimestamp {
|
||||
}
|
||||
|
||||
///
|
||||
/// This trait provides all the functionality to store PostgreSQL relations, SLRUs,
|
||||
/// This impl provides all the functionality to store PostgreSQL relations, SLRUs,
|
||||
/// and other special kinds of files, in a versioned key-value store. The
|
||||
/// Timeline trait provides the key-value store.
|
||||
/// Timeline struct provides the key-value store.
|
||||
///
|
||||
/// This is a trait, so that we can easily include all these functions in a Timeline
|
||||
/// implementation. You're not expected to have different implementations of this trait,
|
||||
/// rather, this provides an interface and implementation, over Timeline.
|
||||
///
|
||||
/// If you wanted to store other kinds of data in the Neon repository, e.g.
|
||||
/// flat files or MySQL, you would create a new trait like this, with all the
|
||||
/// functions that make sense for the kind of data you're storing. For flat files,
|
||||
/// for example, you might have a function like "fn read(path, offset, size)".
|
||||
/// We might also have that situation in the future, to support multiple PostgreSQL
|
||||
/// versions, if there are big changes in how the data is organized in the data
|
||||
/// directory, or if new special files are introduced.
|
||||
///
|
||||
pub trait DatadirTimeline: Timeline {
|
||||
/// This is a separate impl, so that we can easily include all these functions in a Timeline
|
||||
/// implementation, and might be moved into a separate struct later.
|
||||
impl Timeline {
|
||||
/// Start ingesting a WAL record, or other atomic modification of
|
||||
/// the timeline.
|
||||
///
|
||||
@@ -75,7 +65,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
/// functions of the timeline until you finish! And if you update the
|
||||
/// same page twice, the last update wins.
|
||||
///
|
||||
fn begin_modification(&self, lsn: Lsn) -> DatadirModification<Self>
|
||||
pub fn begin_modification(&self, lsn: Lsn) -> DatadirModification
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
@@ -93,7 +83,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Look up given page version.
|
||||
fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_rel_page_at_lsn(&self, tag: RelTag, blknum: BlockNumber, lsn: Lsn) -> Result<Bytes> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
|
||||
let nblocks = self.get_rel_size(tag, lsn)?;
|
||||
@@ -110,7 +100,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
// Get size of a database in blocks
|
||||
fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
|
||||
pub fn get_db_size(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<usize> {
|
||||
let mut total_blocks = 0;
|
||||
|
||||
let rels = self.list_rels(spcnode, dbnode, lsn)?;
|
||||
@@ -123,7 +113,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Get size of a relation file
|
||||
fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
|
||||
pub fn get_rel_size(&self, tag: RelTag, lsn: Lsn) -> Result<BlockNumber> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
|
||||
if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
|
||||
@@ -151,7 +141,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Does relation exist?
|
||||
fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
|
||||
pub fn get_rel_exists(&self, tag: RelTag, lsn: Lsn) -> Result<bool> {
|
||||
ensure!(tag.relnode != 0, "invalid relnode");
|
||||
|
||||
// first try to lookup relation in cache
|
||||
@@ -169,7 +159,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Get a list of all existing relations in given tablespace and database.
|
||||
fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
|
||||
pub fn list_rels(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<HashSet<RelTag>> {
|
||||
// fetch directory listing
|
||||
let key = rel_dir_to_key(spcnode, dbnode);
|
||||
let buf = self.get(key, lsn)?;
|
||||
@@ -187,7 +177,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Look up given SLRU page version.
|
||||
fn get_slru_page_at_lsn(
|
||||
pub fn get_slru_page_at_lsn(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
@@ -199,14 +189,19 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Get size of an SLRU segment
|
||||
fn get_slru_segment_size(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<BlockNumber> {
|
||||
pub fn get_slru_segment_size(
|
||||
&self,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
lsn: Lsn,
|
||||
) -> Result<BlockNumber> {
|
||||
let key = slru_segment_size_to_key(kind, segno);
|
||||
let mut buf = self.get(key, lsn)?;
|
||||
Ok(buf.get_u32_le())
|
||||
}
|
||||
|
||||
/// Does the SLRU segment exist?
|
||||
fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
|
||||
pub fn get_slru_segment_exists(&self, kind: SlruKind, segno: u32, lsn: Lsn) -> Result<bool> {
|
||||
// fetch directory listing
|
||||
let key = slru_dir_to_key(kind);
|
||||
let buf = self.get(key, lsn)?;
|
||||
@@ -223,7 +218,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
|
||||
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
|
||||
let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
let max_lsn = self.get_last_record_lsn();
|
||||
@@ -286,7 +281,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
/// Additionally, sets 'found_smaller'/'found_larger', if it encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
fn is_latest_commit_timestamp_ge_than(
|
||||
pub fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
@@ -317,7 +312,7 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
|
||||
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
|
||||
// fetch directory entry
|
||||
let key = slru_dir_to_key(kind);
|
||||
|
||||
@@ -327,14 +322,14 @@ pub trait DatadirTimeline: Timeline {
|
||||
Ok(dir.segments)
|
||||
}
|
||||
|
||||
fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_relmap_file(&self, spcnode: Oid, dbnode: Oid, lsn: Lsn) -> Result<Bytes> {
|
||||
let key = relmap_file_key(spcnode, dbnode);
|
||||
|
||||
let buf = self.get(key, lsn)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
|
||||
pub fn list_dbdirs(&self, lsn: Lsn) -> Result<HashMap<(Oid, Oid), bool>> {
|
||||
// fetch directory entry
|
||||
let buf = self.get(DBDIR_KEY, lsn)?;
|
||||
let dir = DbDirectory::des(&buf)?;
|
||||
@@ -342,13 +337,13 @@ pub trait DatadirTimeline: Timeline {
|
||||
Ok(dir.dbdirs)
|
||||
}
|
||||
|
||||
fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_twophase_file(&self, xid: TransactionId, lsn: Lsn) -> Result<Bytes> {
|
||||
let key = twophase_file_key(xid);
|
||||
let buf = self.get(key, lsn)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
|
||||
pub fn list_twophase_files(&self, lsn: Lsn) -> Result<HashSet<TransactionId>> {
|
||||
// fetch directory entry
|
||||
let buf = self.get(TWOPHASEDIR_KEY, lsn)?;
|
||||
let dir = TwoPhaseDirectory::des(&buf)?;
|
||||
@@ -356,11 +351,11 @@ pub trait DatadirTimeline: Timeline {
|
||||
Ok(dir.xids)
|
||||
}
|
||||
|
||||
fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_control_file(&self, lsn: Lsn) -> Result<Bytes> {
|
||||
self.get(CONTROLFILE_KEY, lsn)
|
||||
}
|
||||
|
||||
fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
|
||||
pub fn get_checkpoint(&self, lsn: Lsn) -> Result<Bytes> {
|
||||
self.get(CHECKPOINT_KEY, lsn)
|
||||
}
|
||||
|
||||
@@ -369,29 +364,29 @@ pub trait DatadirTimeline: Timeline {
|
||||
///
|
||||
/// Only relation blocks are counted currently. That excludes metadata,
|
||||
/// SLRUs, twophase files etc.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
|
||||
pub fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<u64> {
|
||||
// Fetch list of database dirs and iterate them
|
||||
let buf = self.get(DBDIR_KEY, lsn)?;
|
||||
let dbdir = DbDirectory::des(&buf)?;
|
||||
|
||||
let mut total_size: usize = 0;
|
||||
let mut total_size: u64 = 0;
|
||||
for (spcnode, dbnode) in dbdir.dbdirs.keys() {
|
||||
for rel in self.list_rels(*spcnode, *dbnode, lsn)? {
|
||||
let relsize_key = rel_size_to_key(rel);
|
||||
let mut buf = self.get(relsize_key, lsn)?;
|
||||
let relsize = buf.get_u32_le();
|
||||
|
||||
total_size += relsize as usize;
|
||||
total_size += relsize as u64;
|
||||
}
|
||||
}
|
||||
Ok(total_size * BLCKSZ as usize)
|
||||
Ok(total_size * BLCKSZ as u64)
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a KeySpace that covers all the Keys that are in use at the given LSN.
|
||||
/// Anything that's not listed may be removed from the underlying storage (from
|
||||
/// that LSN forwards).
|
||||
fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
|
||||
pub fn collect_keyspace(&self, lsn: Lsn) -> Result<KeySpace> {
|
||||
// Iterate through key ranges, greedily packing them into partitions
|
||||
let mut result = KeySpaceAccum::new();
|
||||
|
||||
@@ -465,27 +460,54 @@ pub trait DatadirTimeline: Timeline {
|
||||
}
|
||||
|
||||
/// Get the cached size of a relation if it was not updated after the specified LSN
|
||||
fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber>;
|
||||
pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
|
||||
let rel_size_cache = self.rel_size_cache.read().unwrap();
|
||||
if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
|
||||
if lsn >= *cached_lsn {
|
||||
return Some(*nblocks);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Update cached relation size if there is no more recent update
|
||||
fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
|
||||
pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
match rel_size_cache.entry(tag) {
|
||||
hash_map::Entry::Occupied(mut entry) => {
|
||||
let cached_lsn = entry.get_mut();
|
||||
if lsn >= cached_lsn.0 {
|
||||
*cached_lsn = (lsn, nblocks);
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(entry) => {
|
||||
entry.insert((lsn, nblocks));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Store cached relation size
|
||||
fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber);
|
||||
pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.insert(tag, (lsn, nblocks));
|
||||
}
|
||||
|
||||
/// Remove cached relation size
|
||||
fn remove_cached_rel_size(&self, tag: &RelTag);
|
||||
pub fn remove_cached_rel_size(&self, tag: &RelTag) {
|
||||
let mut rel_size_cache = self.rel_size_cache.write().unwrap();
|
||||
rel_size_cache.remove(tag);
|
||||
}
|
||||
}
|
||||
|
||||
/// DatadirModification represents an operation to ingest an atomic set of
|
||||
/// updates to the repository. It is created by the 'begin_record'
|
||||
/// function. It is called for each WAL record, so that all the modifications
|
||||
/// by one WAL record appear atomic.
|
||||
pub struct DatadirModification<'a, T: DatadirTimeline> {
|
||||
pub struct DatadirModification<'a> {
|
||||
/// The timeline this modification applies to. You can access this to
|
||||
/// read the state, but note that any pending updates are *not* reflected
|
||||
/// in the state in 'tline' yet.
|
||||
pub tline: &'a T,
|
||||
pub tline: &'a Timeline,
|
||||
|
||||
/// Lsn assigned by begin_modification
|
||||
pub lsn: Lsn,
|
||||
@@ -495,10 +517,10 @@ pub struct DatadirModification<'a, T: DatadirTimeline> {
|
||||
// underlying key-value store by the 'finish' function.
|
||||
pending_updates: HashMap<Key, Value>,
|
||||
pending_deletions: Vec<Range<Key>>,
|
||||
pending_nblocks: isize,
|
||||
pending_nblocks: i64,
|
||||
}
|
||||
|
||||
impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
impl<'a> DatadirModification<'a> {
|
||||
/// Initialize a completely new repository.
|
||||
///
|
||||
/// This inserts the directory metadata entries that are assumed to
|
||||
@@ -654,7 +676,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
}
|
||||
|
||||
// Update logical database size.
|
||||
self.pending_nblocks -= total_blocks as isize;
|
||||
self.pending_nblocks -= total_blocks as i64;
|
||||
|
||||
// Delete all relations and metadata files for the spcnode/dnode
|
||||
self.delete(dbdir_key_range(spcnode, dbnode));
|
||||
@@ -697,7 +719,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
||||
|
||||
self.pending_nblocks += nblocks as isize;
|
||||
self.pending_nblocks += nblocks as i64;
|
||||
|
||||
// Update relation size cache
|
||||
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
|
||||
@@ -727,7 +749,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
|
||||
|
||||
// Update logical database size.
|
||||
self.pending_nblocks -= old_size as isize - nblocks as isize;
|
||||
self.pending_nblocks -= old_size as i64 - nblocks as i64;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -749,7 +771,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
// Update relation size cache
|
||||
self.tline.set_cached_rel_size(rel, self.lsn, nblocks);
|
||||
|
||||
self.pending_nblocks += nblocks as isize - old_size as isize;
|
||||
self.pending_nblocks += nblocks as i64 - old_size as i64;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -772,7 +794,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
// update logical size
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key)?.get_u32_le();
|
||||
self.pending_nblocks -= old_size as isize;
|
||||
self.pending_nblocks -= old_size as i64;
|
||||
|
||||
// Remove entry from relation size cache
|
||||
self.tline.remove_cached_rel_size(&rel);
|
||||
@@ -914,7 +936,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
result?;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize);
|
||||
writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64);
|
||||
self.pending_nblocks = 0;
|
||||
}
|
||||
|
||||
@@ -942,7 +964,7 @@ impl<'a, T: DatadirTimeline> DatadirModification<'a, T> {
|
||||
writer.finish_write(lsn);
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * BLCKSZ as isize);
|
||||
writer.update_current_logical_size(pending_nblocks * BLCKSZ as i64);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1368,10 +1390,10 @@ fn is_slru_block_key(key: Key) -> bool {
|
||||
//
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn create_test_timeline<R: Repository>(
|
||||
repo: R,
|
||||
pub fn create_test_timeline(
|
||||
repo: &crate::layered_repository::Repository,
|
||||
timeline_id: utils::zid::ZTimelineId,
|
||||
) -> Result<std::sync::Arc<R::Timeline>> {
|
||||
) -> Result<std::sync::Arc<Timeline>> {
|
||||
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
m.init_empty()?;
|
||||
|
||||
@@ -1,19 +1,13 @@
|
||||
use crate::layered_repository::metadata::TimelineMetadata;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, BE};
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::ops::{AddAssign, Range};
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
lsn::{Lsn, RecordLsn},
|
||||
zid::ZTimelineId,
|
||||
};
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
/// Key used in the Repository kv-store.
|
||||
@@ -181,78 +175,6 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository: Send + Sync {
|
||||
type Timeline: crate::DatadirTimeline;
|
||||
|
||||
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
/// This function is idempotent. It doesn't change internal state in any way.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Option<RepositoryTimeline<Self::Timeline>>;
|
||||
|
||||
/// Get Timeline handle for locally available timeline. Load it into memory if it is not loaded.
|
||||
fn get_timeline_load(&self, timelineid: ZTimelineId) -> Result<Arc<Self::Timeline>>;
|
||||
|
||||
/// Lists timelines the repository contains.
|
||||
/// It is up to the repository's implementation to omit certain timelines that are not considered ready for use.
|
||||
fn list_timelines(&self) -> Vec<(ZTimelineId, RepositoryTimeline<Self::Timeline>)>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<Self::Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(
|
||||
&self,
|
||||
src: ZTimelineId,
|
||||
dst: ZTimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Flush all data to disk.
|
||||
///
|
||||
/// this is used at graceful shutdown.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this function is periodically called by gc thread.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
|
||||
/// `checkpoint_before_gc` parameter is used to force compaction of storage before GC
|
||||
/// to make tests more deterministic.
|
||||
/// TODO: Do we still need it, or can we call checkpoint explicitly in tests where needed?
|
||||
fn gc_iteration(
|
||||
&self,
|
||||
timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult>;
|
||||
|
||||
/// Perform one compaction iteration.
|
||||
/// This function is periodically called by compactor thread.
|
||||
/// Also it can be explicitly requested per timeline through page server
|
||||
/// api's 'compact' command.
|
||||
fn compaction_iteration(&self) -> Result<()>;
|
||||
|
||||
/// removes timeline-related in-memory data
|
||||
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;
|
||||
|
||||
/// Allows retrieving the remote timeline index from the repo. Used in walreceiver to grab the remote consistent LSN.
|
||||
fn get_remote_index(&self) -> &RemoteIndex;
|
||||
}
|
||||
|
||||
/// A timeline, that belongs to the current repository.
|
||||
pub enum RepositoryTimeline<T> {
|
||||
/// Timeline, with its files present locally in pageserver's working directory.
|
||||
@@ -304,621 +226,3 @@ impl AddAssign for GcResult {
|
||||
self.elapsed += other.elapsed;
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Timeline: Send + Sync {
|
||||
//------------------------------------------------------------------------------
|
||||
// Public GET functions
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
///
|
||||
/// Wait until WAL has been received and processed up to this LSN.
|
||||
///
|
||||
/// You should call this before any of the other get_* or list_* functions. Calling
|
||||
/// those functions with an LSN that has not been processed yet is an error.
|
||||
///
|
||||
fn wait_lsn(&self, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Lock and get timeline's GC cutoff
|
||||
fn get_latest_gc_cutoff_lsn(&self) -> RwLockReadGuard<Lsn>;
|
||||
|
||||
/// Look up given page version.
|
||||
///
|
||||
/// NOTE: It is considered an error to 'get' a key that doesn't exist. The abstraction
|
||||
/// above this needs to store suitable metadata to track what data exists with
|
||||
/// what keys, in separate metadata entries. If a non-existent key is requested,
|
||||
/// the Repository implementation may incorrectly return a value from an ancestor
|
||||
/// branch, for example, or waste a lot of cycles chasing the non-existing key.
|
||||
///
|
||||
fn get(&self, key: Key, lsn: Lsn) -> Result<Bytes>;
|
||||
|
||||
/// Get the ancestor's timeline id
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
fn get_ancestor_lsn(&self) -> Lsn;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
/// Atomically get both last and prev.
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn;
|
||||
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
|
||||
fn get_disk_consistent_lsn(&self) -> Lsn;
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
///
|
||||
/// FIXME: This ought to return &'a TimelineWriter, where TimelineWriter
|
||||
/// is a generic type in this trait. But that doesn't currently work in
|
||||
/// Rust: https://rust-lang.github.io/rfcs/1598-generic_associated_types.html
|
||||
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Check that it is valid to request operations with that lsn.
|
||||
fn check_lsn_is_in_scope(
|
||||
&self,
|
||||
lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: &RwLockReadGuard<Lsn>,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Get the physical size of the timeline at the latest LSN
|
||||
fn get_physical_size(&self) -> u64;
|
||||
/// Get the physical size of the timeline at the latest LSN non incrementally
|
||||
fn get_physical_size_non_incremental(&self) -> Result<u64>;
|
||||
}
|
||||
|
||||
/// Various functions to mutate the timeline.
|
||||
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
|
||||
// This is probably considered a bad practice in Rust and should be fixed eventually,
|
||||
// but will cause large code changes.
|
||||
pub trait TimelineWriter<'a> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>;
|
||||
|
||||
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Track the end of the latest digested WAL record.
|
||||
///
|
||||
/// Call this after you have finished writing all the WAL up to 'lsn'.
|
||||
///
|
||||
/// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
|
||||
/// the 'lsn' or anything older. The previous last record LSN is stored alongside
|
||||
/// the latest and can be read.
|
||||
fn finish_write(&self, lsn: Lsn);
|
||||
|
||||
fn update_current_logical_size(&self, delta: isize);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod repo_harness {
|
||||
use bytes::BytesMut;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::LayeredRepository,
|
||||
walredo::{WalRedoError, WalRedoManager},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use hex_literal::hex;
|
||||
use utils::zid::ZTenantId;
|
||||
|
||||
pub const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
pub const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(64, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
static LOCK: Lazy<RwLock<()>> = Lazy::new(|| RwLock::new(()));
|
||||
|
||||
impl From<TenantConf> for TenantConfOpt {
|
||||
fn from(tenant_conf: TenantConf) -> Self {
|
||||
Self {
|
||||
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
|
||||
checkpoint_timeout: Some(tenant_conf.checkpoint_timeout),
|
||||
compaction_target_size: Some(tenant_conf.compaction_target_size),
|
||||
compaction_period: Some(tenant_conf.compaction_period),
|
||||
compaction_threshold: Some(tenant_conf.compaction_threshold),
|
||||
gc_horizon: Some(tenant_conf.gc_horizon),
|
||||
gc_period: Some(tenant_conf.gc_period),
|
||||
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
|
||||
pitr_interval: Some(tenant_conf.pitr_interval),
|
||||
walreceiver_connect_timeout: Some(tenant_conf.walreceiver_connect_timeout),
|
||||
lagging_wal_timeout: Some(tenant_conf.lagging_wal_timeout),
|
||||
max_lsn_wal_lag: Some(tenant_conf.max_lsn_wal_lag),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RepoHarness<'a> {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: ZTenantId,
|
||||
|
||||
pub lock_guard: (
|
||||
Option<RwLockReadGuard<'a, ()>>,
|
||||
Option<RwLockWriteGuard<'a, ()>>,
|
||||
),
|
||||
}
|
||||
|
||||
impl<'a> RepoHarness<'a> {
|
||||
pub fn create(test_name: &'static str) -> Result<Self> {
|
||||
Self::create_internal(test_name, false)
|
||||
}
|
||||
pub fn create_exclusive(test_name: &'static str) -> Result<Self> {
|
||||
Self::create_internal(test_name, true)
|
||||
}
|
||||
fn create_internal(test_name: &'static str, exclusive: bool) -> Result<Self> {
|
||||
let lock_guard = if exclusive {
|
||||
(None, Some(LOCK.write().unwrap()))
|
||||
} else {
|
||||
(Some(LOCK.read().unwrap()), None)
|
||||
};
|
||||
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_conf = TenantConf::dummy_conf();
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
lock_guard,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn load(&self) -> LayeredRepository {
|
||||
self.try_load().expect("failed to load test repo")
|
||||
}
|
||||
|
||||
pub fn try_load(&self) -> Result<LayeredRepository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
let repo = LayeredRepository::new(
|
||||
self.conf,
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
RemoteIndex::default(),
|
||||
false,
|
||||
);
|
||||
// populate repo with locally available timelines
|
||||
for timeline_dir_entry in fs::read_dir(self.conf.timelines_path(&self.tenant_id))
|
||||
.expect("should be able to read timelines dir")
|
||||
{
|
||||
let timeline_dir_entry = timeline_dir_entry.unwrap();
|
||||
let timeline_id: ZTimelineId = timeline_dir_entry
|
||||
.path()
|
||||
.file_name()
|
||||
.unwrap()
|
||||
.to_string_lossy()
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
repo.attach_timeline(timeline_id)?;
|
||||
}
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
pub struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, ZenithWalRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
key,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Tests that should work the same with any Repository/Timeline implementation.
|
||||
///
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::repo_harness::*;
|
||||
use super::*;
|
||||
//use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
|
||||
//use std::sync::Arc;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));
|
||||
|
||||
#[test]
|
||||
fn test_basic() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_basic")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x10))?, TEST_IMG("foo at 0x10"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x1f))?, TEST_IMG("foo at 0x10"));
|
||||
assert_eq!(tline.get(*TEST_KEY, Lsn(0x20))?, TEST_IMG("foo at 0x20"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_duplicate_timelines() -> Result<()> {
|
||||
let repo = RepoHarness::create("no_duplicate_timelines")?.load();
|
||||
let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
|
||||
Ok(_) => panic!("duplicate timeline creation should fail"),
|
||||
Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
pub fn test_value(s: &str) -> Value {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
Value::Image(buf.freeze())
|
||||
}
|
||||
|
||||
///
|
||||
/// Test branch creation
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
use std::str::from_utf8;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
||||
#[allow(non_snake_case)]
|
||||
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
|
||||
|
||||
// Insert a value on the timeline
|
||||
writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
|
||||
writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
|
||||
writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
|
||||
writer.finish_write(Lsn(0x40));
|
||||
|
||||
//assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
|
||||
let newtline = repo
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
.expect("Should have a local timeline");
|
||||
let new_writer = newtline.writer();
|
||||
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
|
||||
new_writer.finish_write(Lsn(0x40));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40))?)?,
|
||||
"foo at 0x40"
|
||||
);
|
||||
assert_eq!(
|
||||
from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40))?)?,
|
||||
"bar at 0x40"
|
||||
);
|
||||
assert_eq!(
|
||||
from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40))?)?,
|
||||
"foobar at 0x20"
|
||||
);
|
||||
|
||||
//assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_some_layers<T: Timeline>(tline: &T, start_lsn: Lsn) -> Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
#[allow(non_snake_case)]
|
||||
{
|
||||
let writer = tline.writer();
|
||||
// Create a relation on the timeline
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
|
||||
// and compaction works. But it does set the 'cutoff' point so that the cross check
|
||||
// below should fail.
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("we might've already garbage collected needed data"))
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
|
||||
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(&err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("is earlier than latest GC horizon"));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/*
|
||||
// FIXME: This currently fails to error out. Calling GC doesn't currently
|
||||
// remove the old value, we'd need to work a little harder
|
||||
#[test]
|
||||
fn test_prohibit_get_for_garbage_collected_data() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
|
||||
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
|
||||
match tline.get(*TEST_KEY, Lsn(0x25)) {
|
||||
Ok(_) => panic!("request for page should have failed"),
|
||||
Err(err) => assert!(err.to_string().contains("not found at")),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
*/
|
||||
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
let newtline = repo
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
.expect("Should have a local timeline");
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/// Checks that data written to the parent before the branch point stays
/// readable through the branch after the branch has received its own writes
/// and a GC iteration has been run on the parent.
#[test]
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
    let repo = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?.load();

    // Parent timeline with some layers, branched at LSN 0x40.
    let parent = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
    make_some_layers(parent.as_ref(), Lsn(0x20))?;
    repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;

    let child = repo
        .get_timeline_load(NEW_TIMELINE_ID)
        .expect("Should have a local timeline");

    // Give the branch layers of its own, starting at LSN 0x60.
    make_some_layers(child.as_ref(), Lsn(0x60))?;

    // run gc on parent
    repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;

    // Check that the data is still accessible on the branch.
    let actual = child.get(*TEST_KEY, Lsn(0x50))?;
    let expected = TEST_IMG(&format!("foo at {}", Lsn(0x40)));
    assert_eq!(actual, expected);

    Ok(())
}
|
||||
|
||||
/// Checks that a timeline written by one repository instance is reported as
/// `Unloaded` by a freshly loaded instance, and as `Loaded` once
/// `get_timeline_load` has been called for it.
#[test]
fn timeline_load() -> Result<()> {
    const TEST_NAME: &str = "timeline_load";
    let harness = RepoHarness::create(TEST_NAME)?;

    // First repository instance: create the timeline and force a checkpoint.
    // The scope ends before the repository is loaded a second time.
    {
        let first_repo = harness.load();
        let written = first_repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
        make_some_layers(written.as_ref(), Lsn(0x8000))?;
        written.checkpoint(CheckpointConfig::Forced)?;
    }

    // Second repository instance over the same harness.
    let repo = harness.load();

    let before_load = repo
        .get_timeline(TIMELINE_ID)
        .expect("cannot load timeline");
    assert!(matches!(before_load, RepositoryTimeline::Unloaded { .. }));

    assert!(repo.get_timeline_load(TIMELINE_ID).is_ok());

    let after_load = repo
        .get_timeline(TIMELINE_ID)
        .expect("cannot load timeline");
    assert!(matches!(after_load, RepositoryTimeline::Loaded(_)));

    Ok(())
}
|
||||
|
||||
/// Checks that loading a branched timeline also loads its ancestor.
///
/// Two timelines (a parent and a branch of it) are created and checkpointed
/// by one repository instance. A freshly loaded instance must report both as
/// `Unloaded`; after `get_timeline_load` is called for the child only, both
/// the child and the parent must be reported as `Loaded`.
#[test]
fn timeline_load_with_ancestor() -> Result<()> {
    const TEST_NAME: &str = "timeline_load_with_ancestor";
    let harness = RepoHarness::create(TEST_NAME)?;
    // create two timelines
    {
        let repo = harness.load();
        let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;

        make_some_layers(tline.as_ref(), Lsn(0x20))?;
        tline.checkpoint(CheckpointConfig::Forced)?;

        repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;

        let newtline = repo
            .get_timeline_load(NEW_TIMELINE_ID)
            .expect("Should have a local timeline");

        make_some_layers(newtline.as_ref(), Lsn(0x60))?;
        // Checkpoint the branch that was just written to. The setup used to
        // checkpoint the parent a second time here, which looks like a
        // copy-paste slip from the block above.
        newtline.checkpoint(CheckpointConfig::Forced)?;
    }

    // check that both of them are initially unloaded
    let repo = harness.load();
    {
        let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
        assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));

        let tline = repo
            .get_timeline(NEW_TIMELINE_ID)
            .expect("cannot get timeline");
        assert!(matches!(tline, RepositoryTimeline::Unloaded { .. }));
    }
    // load only child timeline
    let _ = repo
        .get_timeline_load(NEW_TIMELINE_ID)
        .expect("cannot load timeline");

    // check that both, child and ancestor are loaded
    let tline = repo
        .get_timeline(NEW_TIMELINE_ID)
        .expect("cannot get timeline");
    assert!(matches!(tline, RepositoryTimeline::Loaded(_)));

    let tline = repo.get_timeline(TIMELINE_ID).expect("cannot get timeline");
    assert!(matches!(tline, RepositoryTimeline::Loaded(_)));

    Ok(())
}
|
||||
}
|
||||
|
||||
@@ -156,7 +156,7 @@ use std::{
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use once_cell::sync::{Lazy, OnceCell};
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::{
|
||||
fs,
|
||||
runtime::Runtime,
|
||||
@@ -253,36 +253,20 @@ pub struct SyncStartupData {
|
||||
/// Along with that, scans tenant files local and remote (if the sync gets enabled) to check the initial timeline states.
|
||||
pub fn start_local_timeline_sync(
|
||||
config: &'static PageServerConf,
|
||||
storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) -> anyhow::Result<SyncStartupData> {
|
||||
let local_timeline_files = local_tenant_timeline_files(config)
|
||||
.context("Failed to collect local tenant timeline files")?;
|
||||
|
||||
match config.remote_storage_config.as_ref() {
|
||||
Some(storage_config) => {
|
||||
match GenericRemoteStorage::new(config.workdir.clone(), storage_config)
|
||||
.context("Failed to init the generic remote storage")?
|
||||
{
|
||||
GenericRemoteStorage::Local(local_fs_storage) => {
|
||||
storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
local_fs_storage,
|
||||
storage_config.max_concurrent_syncs,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
}
|
||||
GenericRemoteStorage::S3(s3_bucket_storage) => {
|
||||
storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
s3_bucket_storage,
|
||||
storage_config.max_concurrent_syncs,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
}
|
||||
}
|
||||
.context("Failed to spawn the storage sync thread")
|
||||
}
|
||||
match storage.zip(config.remote_storage_config.as_ref()) {
|
||||
Some((storage, storage_config)) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
local_timeline_files,
|
||||
storage,
|
||||
storage_config.max_concurrent_syncs,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
.context("Failed to spawn the storage sync thread"),
|
||||
None => {
|
||||
info!("No remote storage configured, skipping storage sync, considering all local timelines with correct metadata files enabled");
|
||||
let mut local_timeline_init_statuses = LocalTimelineInitStatuses::new();
|
||||
@@ -810,17 +794,13 @@ pub fn schedule_layer_download(tenant_id: ZTenantId, timeline_id: ZTimelineId) {
|
||||
|
||||
/// Launch a thread to perform remote storage sync tasks.
|
||||
/// See module docs for loop step description.
|
||||
pub(super) fn spawn_storage_sync_thread<P, S>(
|
||||
pub(super) fn spawn_storage_sync_thread(
|
||||
conf: &'static PageServerConf,
|
||||
local_timeline_files: HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>,
|
||||
storage: S,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
max_concurrent_timelines_sync: NonZeroUsize,
|
||||
max_sync_errors: NonZeroU32,
|
||||
) -> anyhow::Result<SyncStartupData>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> anyhow::Result<SyncStartupData> {
|
||||
let sync_queue = SyncQueue::new(max_concurrent_timelines_sync);
|
||||
SYNC_QUEUE
|
||||
.set(sync_queue)
|
||||
@@ -860,7 +840,7 @@ where
|
||||
storage_sync_loop(
|
||||
runtime,
|
||||
conf,
|
||||
(Arc::new(storage), remote_index_clone, sync_queue),
|
||||
(storage, remote_index_clone, sync_queue),
|
||||
max_sync_errors,
|
||||
);
|
||||
Ok(())
|
||||
@@ -873,15 +853,12 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
fn storage_sync_loop<P, S>(
|
||||
fn storage_sync_loop(
|
||||
runtime: Runtime,
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (Arc<S>, RemoteIndex, &SyncQueue),
|
||||
(storage, index, sync_queue): (Arc<GenericRemoteStorage>, RemoteIndex, &SyncQueue),
|
||||
max_sync_errors: NonZeroU32,
|
||||
) where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) {
|
||||
info!("Starting remote storage sync loop");
|
||||
loop {
|
||||
let loop_storage = Arc::clone(&storage);
|
||||
@@ -983,18 +960,14 @@ enum UploadStatus {
|
||||
Nothing,
|
||||
}
|
||||
|
||||
async fn process_batches<P, S>(
|
||||
async fn process_batches(
|
||||
conf: &'static PageServerConf,
|
||||
max_sync_errors: NonZeroU32,
|
||||
storage: Arc<S>,
|
||||
storage: Arc<GenericRemoteStorage>,
|
||||
index: &RemoteIndex,
|
||||
batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
|
||||
sync_queue: &SyncQueue,
|
||||
) -> HashSet<ZTenantId>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> HashSet<ZTenantId> {
|
||||
let mut sync_results = batched_tasks
|
||||
.into_iter()
|
||||
.map(|(sync_id, batch)| {
|
||||
@@ -1030,17 +1003,13 @@ where
|
||||
downloaded_timelines
|
||||
}
|
||||
|
||||
async fn process_sync_task_batch<P, S>(
|
||||
async fn process_sync_task_batch(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (Arc<S>, RemoteIndex, &SyncQueue),
|
||||
(storage, index, sync_queue): (Arc<GenericRemoteStorage>, RemoteIndex, &SyncQueue),
|
||||
max_sync_errors: NonZeroU32,
|
||||
sync_id: ZTenantTimelineId,
|
||||
batch: SyncTaskBatch,
|
||||
) -> DownloadStatus
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> DownloadStatus {
|
||||
let sync_start = Instant::now();
|
||||
let current_remote_timeline = { index.read().await.timeline_entry(&sync_id).cloned() };
|
||||
|
||||
@@ -1175,19 +1144,15 @@ where
|
||||
download_status
|
||||
}
|
||||
|
||||
async fn download_timeline_data<P, S>(
|
||||
async fn download_timeline_data(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
|
||||
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
|
||||
current_remote_timeline: Option<&RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
new_download_data: SyncData<LayersDownload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> DownloadStatus
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> DownloadStatus {
|
||||
match download_timeline_layers(
|
||||
conf,
|
||||
storage,
|
||||
@@ -1298,17 +1263,14 @@ async fn update_local_metadata(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn delete_timeline_data<P, S>(
|
||||
async fn delete_timeline_data(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
|
||||
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut new_delete_data: SyncData<LayersDeletion>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) {
|
||||
let timeline_delete = &mut new_delete_data.data;
|
||||
|
||||
if !timeline_delete.deletion_registered {
|
||||
@@ -1343,19 +1305,15 @@ async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMeta
|
||||
.context("Failed to parse metadata bytes")
|
||||
}
|
||||
|
||||
async fn upload_timeline_data<P, S>(
|
||||
async fn upload_timeline_data(
|
||||
conf: &'static PageServerConf,
|
||||
(storage, index, sync_queue): (&S, &RemoteIndex, &SyncQueue),
|
||||
(storage, index, sync_queue): (&GenericRemoteStorage, &RemoteIndex, &SyncQueue),
|
||||
current_remote_timeline: Option<&RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
new_upload_data: SyncData<LayersUpload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> UploadStatus
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> UploadStatus {
|
||||
let mut uploaded_data = match upload_timeline_layers(
|
||||
storage,
|
||||
sync_queue,
|
||||
@@ -1406,17 +1364,13 @@ enum RemoteDataUpdate<'a> {
|
||||
Delete(&'a HashSet<PathBuf>),
|
||||
}
|
||||
|
||||
async fn update_remote_data<P, S>(
|
||||
async fn update_remote_data(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
storage: &GenericRemoteStorage,
|
||||
index: &RemoteIndex,
|
||||
sync_id: ZTenantTimelineId,
|
||||
update: RemoteDataUpdate<'_>,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let updated_remote_timeline = {
|
||||
let mut index_accessor = index.write().await;
|
||||
|
||||
@@ -1642,7 +1596,7 @@ fn register_sync_status(
|
||||
mod test_utils {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::repository::repo_harness::RepoHarness;
|
||||
use crate::layered_repository::repo_harness::RepoHarness;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -1687,7 +1641,7 @@ mod test_utils {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::test_utils::dummy_metadata;
|
||||
use crate::repository::repo_harness::TIMELINE_ID;
|
||||
use crate::layered_repository::repo_harness::TIMELINE_ID;
|
||||
use hex_literal::hex;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
|
||||
@@ -1,27 +1,25 @@
|
||||
//! Timeline synchronization logic to delete a bulk of timeline's remote files from the remote storage.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tracing::{debug, error, info};
|
||||
|
||||
use crate::storage_sync::{SyncQueue, SyncTask};
|
||||
use remote_storage::RemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{LayersDeletion, SyncData};
|
||||
|
||||
/// Attempts to remove the timeline layers from the remote storage.
|
||||
/// If the task had not adjusted the metadata before, the deletion will fail.
|
||||
pub(super) async fn delete_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
pub(super) async fn delete_timeline_layers<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &SyncQueue,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut delete_data: SyncData<LayersDeletion>,
|
||||
) -> bool
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> bool {
|
||||
if !delete_data.data.deletion_registered {
|
||||
error!("Cannot delete timeline layers before the deletion metadata is not registered, reenqueueing");
|
||||
delete_data.retries += 1;
|
||||
@@ -45,25 +43,14 @@ where
|
||||
let mut delete_tasks = layers_to_delete
|
||||
.into_iter()
|
||||
.map(|local_layer_path| async {
|
||||
let storage_path =
|
||||
match storage
|
||||
.remote_object_id(&local_layer_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
}) {
|
||||
Ok(path) => path,
|
||||
Err(e) => return Err((e, local_layer_path)),
|
||||
};
|
||||
|
||||
match storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
}) {
|
||||
match match storage {
|
||||
GenericRemoteStorage::Local(storage) => {
|
||||
remove_storage_object(storage, &local_layer_path).await
|
||||
}
|
||||
GenericRemoteStorage::S3(storage) => {
|
||||
remove_storage_object(storage, &local_layer_path).await
|
||||
}
|
||||
} {
|
||||
Ok(()) => Ok(local_layer_path),
|
||||
Err(e) => Err((e, local_layer_path)),
|
||||
}
|
||||
@@ -101,6 +88,28 @@ where
|
||||
errored
|
||||
}
|
||||
|
||||
async fn remove_storage_object<P, S>(storage: &S, local_layer_path: &Path) -> anyhow::Result<()>
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let storage_path = storage
|
||||
.remote_object_id(local_layer_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
local_layer_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage.delete(&storage_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to delete remote layer from storage at '{:?}'",
|
||||
storage_path
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{collections::HashSet, num::NonZeroUsize};
|
||||
@@ -111,10 +120,10 @@ mod tests {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::test_utils::{create_local_timeline, dummy_metadata},
|
||||
};
|
||||
use remote_storage::LocalFs;
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -123,10 +132,10 @@ mod tests {
|
||||
let harness = RepoHarness::create("delete_timeline_negative")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
)?);
|
||||
|
||||
let deleted = delete_timeline_layers(
|
||||
&storage,
|
||||
@@ -158,17 +167,20 @@ mod tests {
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "c", "d"];
|
||||
let storage = LocalFs::new(
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
)?);
|
||||
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.remote_object_id(&local_path)?;
|
||||
let remote_path = local_storage.remote_object_id(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
@@ -176,11 +188,11 @@ mod tests {
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
assert_eq!(
|
||||
storage
|
||||
local_storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| storage.local_path(&remote_path).unwrap())
|
||||
.map(|remote_path| local_storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
@@ -213,11 +225,11 @@ mod tests {
|
||||
assert!(deleted, "Should be able to delete timeline files");
|
||||
|
||||
assert_eq!(
|
||||
storage
|
||||
local_storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|remote_path| storage.local_path(&remote_path).unwrap())
|
||||
.map(|remote_path| local_storage.local_path(&remote_path).unwrap())
|
||||
.filter_map(|local_path| { Some(local_path.file_name()?.to_str()?.to_owned()) })
|
||||
.sorted()
|
||||
.collect::<Vec<_>>(),
|
||||
|
||||
@@ -9,7 +9,9 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage};
|
||||
use remote_storage::{
|
||||
path_with_suffix_extension, Download, DownloadError, GenericRemoteStorage, RemoteStorage,
|
||||
};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncWriteExt},
|
||||
@@ -62,15 +64,11 @@ impl Default for TenantIndexParts {
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download_index_parts<P, S>(
|
||||
pub async fn download_index_parts(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
storage: &GenericRemoteStorage,
|
||||
keys: HashSet<ZTenantTimelineId>,
|
||||
) -> HashMap<ZTenantId, TenantIndexParts>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> HashMap<ZTenantId, TenantIndexParts> {
|
||||
let mut index_parts: HashMap<ZTenantId, TenantIndexParts> = HashMap::new();
|
||||
|
||||
let mut part_downloads = keys
|
||||
@@ -114,60 +112,17 @@ where
|
||||
/// Note: The function is rather expensive from s3 access point of view, it will execute ceil(N/1000) + N requests.
|
||||
/// At least one request to obtain a list of tenant timelines (more requests if there are more than 1000 timelines).
|
||||
/// And then will attempt to download all index files that belong to these timelines.
|
||||
pub async fn gather_tenant_timelines_index_parts<P, S>(
|
||||
pub async fn gather_tenant_timelines_index_parts(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>>
|
||||
where
|
||||
P: RemoteObjectName + Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>> {
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(tenant_storage_path))
|
||||
let timeline_sync_ids = get_timeline_sync_ids(storage, &tenant_path, tenant_id)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to list tenant storage path to get remote timelines to download: {}",
|
||||
tenant_id
|
||||
)
|
||||
})?;
|
||||
.with_context(|| format!("Failed to list timeline sync ids for tenat {tenant_id}"))?;
|
||||
|
||||
if timelines.is_empty() {
|
||||
anyhow::bail!(
|
||||
"no timelines found on the remote storage for tenant {}",
|
||||
tenant_id
|
||||
)
|
||||
}
|
||||
|
||||
let mut sync_ids = HashSet::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: ZTimelineId = object_name
|
||||
.parse()
|
||||
.with_context(|| {
|
||||
format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'")
|
||||
})?;
|
||||
|
||||
sync_ids.insert(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
}
|
||||
|
||||
match download_index_parts(conf, storage, sync_ids)
|
||||
match download_index_parts(conf, storage, timeline_sync_ids)
|
||||
.await
|
||||
.remove(&tenant_id)
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))?
|
||||
@@ -180,29 +135,15 @@ where
|
||||
}
|
||||
|
||||
/// Retrieves index data from the remote storage for a given timeline.
|
||||
async fn download_index_part<P, S>(
|
||||
async fn download_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> Result<IndexPart, DownloadError>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let part_storage_path = storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
let mut index_part_download = storage.download(&part_storage_path).await?;
|
||||
let mut index_part_download = download_storage_object(storage, &index_part_path).await?;
|
||||
|
||||
let mut index_part_bytes = Vec::new();
|
||||
io::copy(
|
||||
@@ -211,14 +152,18 @@ where
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to download an index part from storage path {part_storage_path:?}")
|
||||
format!(
|
||||
"Failed to download an index part into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize index part file from storage path '{part_storage_path:?}'"
|
||||
"Failed to deserialize index part file into file '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?;
|
||||
@@ -249,18 +194,14 @@ pub(super) enum DownloadedTimeline {
|
||||
/// updated in the end, if the remote one contains a newer disk_consistent_lsn.
|
||||
///
|
||||
/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task.
|
||||
pub(super) async fn download_timeline_layers<'a, P, S>(
|
||||
pub(super) async fn download_timeline_layers<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a S,
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &'a SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut download_data: SyncData<LayersDownload>,
|
||||
) -> DownloadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> DownloadedTimeline {
|
||||
let remote_timeline = match remote_timeline {
|
||||
Some(remote_timeline) => {
|
||||
if !remote_timeline.awaits_download {
|
||||
@@ -300,15 +241,6 @@ where
|
||||
layer_desination_path.display()
|
||||
);
|
||||
} else {
|
||||
let layer_storage_path = storage
|
||||
.remote_object_id(&layer_desination_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
layer_desination_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
@@ -329,19 +261,23 @@ where
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage
|
||||
.download(&layer_storage_path)
|
||||
|
||||
let mut layer_download = download_storage_object(storage, &layer_desination_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
|
||||
"Failed to initiate the download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
io::copy(&mut layer_download.download_stream, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download the layer for {sync_id} into file '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
@@ -429,6 +365,121 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_storage_object(
|
||||
storage: &GenericRemoteStorage,
|
||||
to_path: &Path,
|
||||
) -> Result<Download, DownloadError> {
|
||||
async fn do_download_storage_object<P, S>(
|
||||
storage: &S,
|
||||
to_path: &Path,
|
||||
) -> Result<Download, DownloadError>
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let remote_object_path = storage
|
||||
.remote_object_id(to_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the storage path for target local path '{}'",
|
||||
to_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
|
||||
storage.download(&remote_object_path).await
|
||||
}
|
||||
|
||||
match storage {
|
||||
GenericRemoteStorage::Local(storage) => do_download_storage_object(storage, to_path).await,
|
||||
GenericRemoteStorage::S3(storage) => do_download_storage_object(storage, to_path).await,
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_timeline_sync_ids(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_path: &Path,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<HashSet<ZTenantTimelineId>> {
|
||||
let timeline_ids: Vec<ZTimelineId> = match storage {
|
||||
GenericRemoteStorage::Local(storage) => list_prefixes(storage, tenant_path)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|timeline_directory_path| {
|
||||
timeline_directory_path
|
||||
.file_stem()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get timeline id string from file '{}'",
|
||||
timeline_directory_path.display()
|
||||
)
|
||||
})?
|
||||
.to_string_lossy()
|
||||
.as_ref()
|
||||
.parse()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to parse directory name '{}' as timeline id",
|
||||
timeline_directory_path.display()
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<_>>(),
|
||||
GenericRemoteStorage::S3(storage) => list_prefixes(storage, tenant_path)
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|s3_path| {
|
||||
s3_path
|
||||
.object_name()
|
||||
.with_context(|| {
|
||||
format!("Failed to get object name out of S3 path {s3_path:?}")
|
||||
})?
|
||||
.parse()
|
||||
.with_context(|| {
|
||||
format!("failed to parse object name '{s3_path:?}' as timeline id")
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<_>>(),
|
||||
}
|
||||
.with_context(|| {
|
||||
format!("Tenant {tenant_id} has at least one incorrect timeline subdirectory")
|
||||
})?;
|
||||
|
||||
if timeline_ids.is_empty() {
|
||||
anyhow::bail!("no timelines found on the remote storage for tenant {tenant_id}")
|
||||
}
|
||||
|
||||
Ok(timeline_ids
|
||||
.into_iter()
|
||||
.map(|timeline_id| ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn list_prefixes<P, S>(storage: &S, tenant_path: &Path) -> anyhow::Result<Vec<P>>
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let tenant_storage_path = storage.remote_object_id(tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.list_prefixes(Some(&tenant_storage_path))
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to list tenant storage path {tenant_storage_path:?} to get remote timelines to download"
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
async fn fsync_path(path: impl AsRef<Path>) -> Result<(), io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
}
|
||||
@@ -445,7 +496,7 @@ mod tests {
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
@@ -461,10 +512,11 @@ mod tests {
|
||||
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
@@ -472,7 +524,7 @@ mod tests {
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.remote_object_id(&local_path)?;
|
||||
let remote_path = local_storage.remote_object_id(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
@@ -558,7 +610,10 @@ mod tests {
|
||||
let harness = RepoHarness::create("download_timeline_negatives")?;
|
||||
let sync_queue = SyncQueue::new(NonZeroUsize::new(100).unwrap());
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
|
||||
let empty_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
@@ -614,10 +669,11 @@ mod tests {
|
||||
let harness = RepoHarness::create("test_download_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
@@ -638,7 +694,7 @@ mod tests {
|
||||
metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let storage_path = storage.remote_object_id(&local_index_part_path)?;
|
||||
let storage_path = local_storage.remote_object_id(&local_index_part_path)?;
|
||||
fs::create_dir_all(storage_path.parent().unwrap()).await?;
|
||||
fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?;
|
||||
|
||||
|
||||
@@ -210,7 +210,7 @@ impl RemoteTimelineIndex {
|
||||
}
|
||||
|
||||
/// Restored index part data about the timeline, stored in the remote index.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_layers: HashSet<PathBuf>,
|
||||
missing_layers: HashSet<PathBuf>,
|
||||
@@ -341,7 +341,7 @@ mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion() {
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
|
||||
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
use std::{
|
||||
fmt::Debug,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use once_cell::sync::Lazy;
|
||||
use remote_storage::RemoteStorage;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
@@ -30,16 +33,12 @@ static NO_LAYERS_UPLOAD: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<P, S>(
|
||||
pub(super) async fn upload_index_part(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: ZTenantTimelineId,
|
||||
index_part: IndexPart,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> anyhow::Result<()> {
|
||||
let index_part_bytes = serde_json::to_vec(&index_part)
|
||||
.context("Failed to serialize index part file into bytes")?;
|
||||
let index_part_size = index_part_bytes.len();
|
||||
@@ -48,27 +47,9 @@ where
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let index_part_storage_path =
|
||||
storage
|
||||
.remote_object_id(&index_part_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(
|
||||
index_part_bytes,
|
||||
index_part_size,
|
||||
&index_part_storage_path,
|
||||
None,
|
||||
)
|
||||
upload_storage_object(storage, index_part_bytes, index_part_size, &index_part_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to upload index part to the storage path '{index_part_storage_path:?}'")
|
||||
})
|
||||
.with_context(|| format!("Failed to upload index part for '{sync_id}'"))
|
||||
}
|
||||
|
||||
/// Timeline upload result, with extra data, needed for uploading.
|
||||
@@ -84,17 +65,13 @@ pub(super) enum UploadedTimeline {
|
||||
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
pub(super) async fn upload_timeline_layers<'a>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
sync_queue: &SyncQueue,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut upload_data: SyncData<LayersUpload>,
|
||||
) -> UploadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
) -> UploadedTimeline {
|
||||
let upload = &mut upload_data.data;
|
||||
let new_upload_lsn = upload
|
||||
.metadata
|
||||
@@ -132,16 +109,6 @@ where
|
||||
let mut upload_tasks = layers_to_upload
|
||||
.into_iter()
|
||||
.map(|source_path| async move {
|
||||
let storage_path = storage
|
||||
.remote_object_id(&source_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(UploadError::Other)?;
|
||||
|
||||
let source_file = match fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upen a source file for layer '{}'",
|
||||
@@ -164,15 +131,10 @@ where
|
||||
.map_err(UploadError::Other)?
|
||||
.len() as usize;
|
||||
|
||||
match storage
|
||||
.upload(source_file, source_size, &storage_path, None)
|
||||
match upload_storage_object(storage, source_file, source_size, &source_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload a layer from local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
}) {
|
||||
.with_context(|| format!("Failed to upload layer file for {sync_id}"))
|
||||
{
|
||||
Ok(()) => Ok(source_path),
|
||||
Err(e) => Err(UploadError::MissingLocalFile(source_path, e)),
|
||||
}
|
||||
@@ -231,6 +193,51 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_storage_object(
|
||||
storage: &GenericRemoteStorage,
|
||||
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
from_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn do_upload_storage_object<P, S>(
|
||||
storage: &S,
|
||||
from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
from_path: &Path,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let target_storage_path = storage.remote_object_id(from_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the storage path for source local path '{}'",
|
||||
from_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(from, from_size_bytes, &target_storage_path, None)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload from '{}' to storage path '{:?}'",
|
||||
from_path.display(),
|
||||
target_storage_path
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
match storage {
|
||||
GenericRemoteStorage::Local(storage) => {
|
||||
do_upload_storage_object(storage, from, from_size_bytes, from_path).await
|
||||
}
|
||||
GenericRemoteStorage::S3(storage) => {
|
||||
do_upload_storage_object(storage, from, from_size_bytes, from_path).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
enum UploadError {
|
||||
MissingLocalFile(PathBuf, anyhow::Error),
|
||||
Other(anyhow::Error),
|
||||
@@ -243,12 +250,12 @@ mod tests {
|
||||
num::NonZeroUsize,
|
||||
};
|
||||
|
||||
use remote_storage::LocalFs;
|
||||
use remote_storage::{LocalFs, RemoteStorage};
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
layered_repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
@@ -264,10 +271,11 @@ mod tests {
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a", "b"];
|
||||
let storage = LocalFs::new(
|
||||
tempdir()?.path().to_path_buf(),
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?;
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
@@ -276,7 +284,7 @@ mod tests {
|
||||
timeline_upload.metadata = None;
|
||||
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
local_storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
|
||||
@@ -322,7 +330,7 @@ mod tests {
|
||||
"Successful upload without metadata should not have it returned either"
|
||||
);
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
let storage_files = local_storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
layer_files.len(),
|
||||
@@ -331,7 +339,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
storage_files
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path))
|
||||
.map(|storage_path| local_storage.local_path(&storage_path))
|
||||
.collect::<anyhow::Result<BTreeSet<_>>>()?,
|
||||
layer_files
|
||||
.into_iter()
|
||||
@@ -351,7 +359,11 @@ mod tests {
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a1", "b1"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let current_retries = 5;
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
|
||||
@@ -365,7 +377,7 @@ mod tests {
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone())
|
||||
.await?;
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
local_storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
|
||||
@@ -414,7 +426,7 @@ mod tests {
|
||||
"Successful upload should not change its metadata"
|
||||
);
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
let storage_files = local_storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
layer_files.len(),
|
||||
@@ -423,7 +435,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
storage_files
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path))
|
||||
.map(|storage_path| local_storage.local_path(&storage_path))
|
||||
.collect::<anyhow::Result<BTreeSet<_>>>()?,
|
||||
layer_files
|
||||
.into_iter()
|
||||
@@ -440,7 +452,11 @@ mod tests {
|
||||
let harness = RepoHarness::create("test_upload_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), harness.conf.workdir.clone())?;
|
||||
let storage = GenericRemoteStorage::Local(LocalFs::new(
|
||||
tempdir()?.path().to_owned(),
|
||||
harness.conf.workdir.clone(),
|
||||
)?);
|
||||
let local_storage = storage.as_local().unwrap();
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
@@ -458,12 +474,12 @@ mod tests {
|
||||
);
|
||||
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
local_storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?;
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
let storage_files = local_storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
1,
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::http::models::TenantInfo;
|
||||
use crate::layered_repository::{load_metadata, LayeredRepository, LayeredTimeline};
|
||||
use crate::repository::Repository;
|
||||
use crate::layered_repository::{load_metadata, Repository, Timeline};
|
||||
use crate::repository::RepositoryTimeline;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
@@ -12,6 +12,7 @@ use crate::thread_mgr::ThreadKind;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{thread_mgr, timelines, walreceiver};
|
||||
use anyhow::Context;
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
@@ -21,6 +22,7 @@ use tokio::sync::mpsc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub use tenants_state::try_send_timeline_update;
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
@@ -68,7 +70,7 @@ mod tenants_state {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(super) fn try_send_timeline_update(update: LocalTimelineUpdate) {
|
||||
pub fn try_send_timeline_update(update: LocalTimelineUpdate) {
|
||||
match TIMELINE_UPDATE_SENDER
|
||||
.read()
|
||||
.expect("Failed to read() timeline_update_sender lock, it got poisoned")
|
||||
@@ -94,13 +96,7 @@ mod tenants_state {
|
||||
struct Tenant {
|
||||
state: TenantState,
|
||||
/// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk.
|
||||
repo: Arc<LayeredRepository>,
|
||||
/// Timelines, located locally in the pageserver's datadir.
|
||||
/// Timelines can entirely be removed entirely by the `detach` operation only.
|
||||
///
|
||||
/// Local timelines have more metadata that's loaded into memory,
|
||||
/// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`].
|
||||
local_timelines: HashMap<ZTimelineId, Arc<LayeredTimeline>>,
|
||||
repo: Arc<Repository>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -136,7 +132,10 @@ impl fmt::Display for TenantState {
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the repository once download is completed.
|
||||
pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIndex> {
|
||||
pub fn init_tenant_mgr(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<Arc<GenericRemoteStorage>>,
|
||||
) -> anyhow::Result<RemoteIndex> {
|
||||
let (timeline_updates_sender, timeline_updates_receiver) =
|
||||
mpsc::unbounded_channel::<LocalTimelineUpdate>();
|
||||
tenants_state::set_timeline_update_sender(timeline_updates_sender)?;
|
||||
@@ -145,7 +144,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = storage_sync::start_local_timeline_sync(conf)
|
||||
} = storage_sync::start_local_timeline_sync(conf, remote_storage)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
@@ -177,15 +176,15 @@ pub enum LocalTimelineUpdate {
|
||||
},
|
||||
Attach {
|
||||
id: ZTenantTimelineId,
|
||||
datadir: Arc<LayeredTimeline>,
|
||||
timeline: Arc<Timeline>,
|
||||
},
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for LocalTimelineUpdate {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(),
|
||||
Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(),
|
||||
Self::Detach { id, .. } => f.debug_tuple("Detach").field(id).finish(),
|
||||
Self::Attach { id, .. } => f.debug_tuple("Attach").field(id).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -289,7 +288,6 @@ pub fn create_tenant_repository(
|
||||
v.insert(Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
local_timelines: HashMap::new(),
|
||||
});
|
||||
Ok(Some(tenant_id))
|
||||
}
|
||||
@@ -365,7 +363,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<LayeredRepository>> {
|
||||
pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<Repository>> {
|
||||
let m = tenants_state::read_tenants();
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
@@ -379,21 +377,15 @@ pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<Lay
|
||||
pub fn get_local_timeline_with_load(
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<Arc<LayeredTimeline>> {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let tenant = m
|
||||
.get_mut(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))?;
|
||||
|
||||
if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) {
|
||||
Ok(Arc::clone(page_tline))
|
||||
} else {
|
||||
let page_tline = load_local_timeline(&tenant.repo, timeline_id)
|
||||
.with_context(|| format!("Failed to load local timeline for tenant {tenant_id}"))?;
|
||||
tenant
|
||||
.local_timelines
|
||||
.insert(timeline_id, Arc::clone(&page_tline));
|
||||
Ok(page_tline)
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let repository = get_repository_for_tenant(tenant_id)?;
|
||||
match repository.get_timeline(timeline_id) {
|
||||
Some(RepositoryTimeline::Loaded(loaded_timeline)) => {
|
||||
loaded_timeline.init_logical_size()?;
|
||||
Ok(loaded_timeline)
|
||||
}
|
||||
_ => load_local_timeline(&repository, timeline_id)
|
||||
.with_context(|| format!("Failed to load local timeline for tenant {tenant_id}")),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -420,10 +412,7 @@ pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow
|
||||
thread_mgr::shutdown_threads(None, None, Some(timeline_id));
|
||||
debug!("thread shutdown completed");
|
||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.repo.delete_timeline(timeline_id)?;
|
||||
tenant.local_timelines.remove(&timeline_id);
|
||||
}
|
||||
Some(tenant) => tenant.repo.delete_timeline(timeline_id)?,
|
||||
None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"),
|
||||
}
|
||||
|
||||
@@ -435,23 +424,21 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any
|
||||
// shutdown the tenant and timeline threads: gc, compaction, page service threads)
|
||||
thread_mgr::shutdown_threads(None, Some(tenant_id), None);
|
||||
|
||||
// FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state?
|
||||
// send stop signal to wal receiver and collect join handles while holding the lock
|
||||
let walreceiver_join_handles = {
|
||||
let tenants = tenants_state::write_tenants();
|
||||
let tenant = tenants.get(&tenant_id).context("tenant not found")?;
|
||||
let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len());
|
||||
for timeline_id in tenant.local_timelines.keys() {
|
||||
let mut walreceiver_join_handles = Vec::new();
|
||||
let removed_tenant = {
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
tenants_accessor.remove(&tenant_id)
|
||||
};
|
||||
if let Some(tenant) = removed_tenant {
|
||||
for (timeline_id, _) in tenant.repo.list_timelines() {
|
||||
let (sender, receiver) = std::sync::mpsc::channel::<()>();
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
|
||||
id: ZTenantTimelineId::new(tenant_id, *timeline_id),
|
||||
id: ZTenantTimelineId::new(tenant_id, timeline_id),
|
||||
join_confirmation_sender: sender,
|
||||
});
|
||||
walreceiver_join_handles.push((*timeline_id, receiver));
|
||||
walreceiver_join_handles.push((timeline_id, receiver));
|
||||
}
|
||||
// drop the tenants lock
|
||||
walreceiver_join_handles
|
||||
};
|
||||
}
|
||||
|
||||
// wait for wal receivers to stop without holding the lock, because walreceiver
|
||||
// will attempt to change tenant state which is protected by the same global tenants lock.
|
||||
@@ -484,19 +471,13 @@ pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> any
|
||||
}
|
||||
|
||||
fn load_local_timeline(
|
||||
repo: &LayeredRepository,
|
||||
repo: &Repository,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<Arc<LayeredTimeline>> {
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| {
|
||||
format!("Inmem timeline {timeline_id} not found in tenant's repository")
|
||||
})?;
|
||||
inmem_timeline.init_logical_size()?;
|
||||
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach {
|
||||
id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
|
||||
datadir: Arc::clone(&inmem_timeline),
|
||||
});
|
||||
|
||||
Ok(inmem_timeline)
|
||||
}
|
||||
|
||||
@@ -588,37 +569,24 @@ fn init_local_repository(
|
||||
}
|
||||
|
||||
fn attach_downloaded_tenant(
|
||||
repo: &LayeredRepository,
|
||||
repo: &Repository,
|
||||
downloaded_timelines: HashSet<ZTimelineId>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut registration_queue = Vec::with_capacity(downloaded_timelines.len());
|
||||
|
||||
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
|
||||
for timeline_id in downloaded_timelines {
|
||||
// first, register timeline metadata to ensure ancestors will be found later during layer load
|
||||
for &timeline_id in &downloaded_timelines {
|
||||
repo.attach_timeline(timeline_id).with_context(|| {
|
||||
format!("Failed to load timeline {timeline_id} into in-memory repository")
|
||||
})?;
|
||||
registration_queue.push(timeline_id);
|
||||
}
|
||||
|
||||
for timeline_id in registration_queue {
|
||||
let tenant_id = repo.tenant_id();
|
||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
|
||||
Entry::Occupied(_) => {
|
||||
anyhow::bail!("Local timeline {timeline_id} already registered")
|
||||
}
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
|
||||
format!("Failed to register add local timeline for tenant {tenant_id}")
|
||||
})?);
|
||||
}
|
||||
},
|
||||
None => anyhow::bail!(
|
||||
"Tenant {} not found in local tenant state",
|
||||
repo.tenant_id()
|
||||
),
|
||||
}
|
||||
// and then load its layers in memory
|
||||
for timeline_id in downloaded_timelines {
|
||||
let _ = load_local_timeline(repo, timeline_id).with_context(|| {
|
||||
format!(
|
||||
"Failed to register add local timeline for tenant {}",
|
||||
repo.tenant_id(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -630,14 +598,14 @@ fn load_local_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> anyhow::Result<Arc<LayeredRepository>> {
|
||||
) -> anyhow::Result<Arc<Repository>> {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
|
||||
let repo: Arc<Repository> = Arc::new(Repository::new(
|
||||
conf,
|
||||
TenantConfOpt::default(),
|
||||
Arc::new(walredo_mgr),
|
||||
@@ -648,12 +616,11 @@ fn load_local_repo(
|
||||
Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
local_timelines: HashMap::new(),
|
||||
}
|
||||
});
|
||||
|
||||
// Restore tenant config
|
||||
let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
|
||||
let tenant_conf = Repository::load_tenant_config(conf, tenant_id)?;
|
||||
tenant.repo.update_tenant_config(tenant_conf)?;
|
||||
|
||||
Ok(Arc::clone(&tenant.repo))
|
||||
|
||||
@@ -5,7 +5,6 @@ use std::collections::HashMap;
|
||||
use std::ops::ControlFlow;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::repository::Repository;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::{tenant_mgr, thread_mgr};
|
||||
|
||||
@@ -20,15 +20,14 @@ use utils::{
|
||||
|
||||
use crate::import_datadir;
|
||||
use crate::tenant_mgr;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{
|
||||
config::PageServerConf, repository::Repository, storage_sync::index::RemoteIndex,
|
||||
tenant_config::TenantConfOpt,
|
||||
config::PageServerConf, storage_sync::index::RemoteIndex, tenant_config::TenantConfOpt,
|
||||
};
|
||||
use crate::{
|
||||
layered_repository::{LayeredRepository, LayeredTimeline},
|
||||
layered_repository::{Repository, Timeline},
|
||||
walredo::WalRedoManager,
|
||||
};
|
||||
use crate::{repository::Timeline, CheckpointConfig};
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct PointInTime {
|
||||
@@ -42,7 +41,7 @@ pub fn create_repo(
|
||||
tenant_id: ZTenantId,
|
||||
wal_redo_manager: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> Result<Arc<LayeredRepository>> {
|
||||
) -> Result<Arc<Repository>> {
|
||||
let repo_dir = conf.tenant_path(&tenant_id);
|
||||
ensure!(
|
||||
!repo_dir.exists(),
|
||||
@@ -57,9 +56,9 @@ pub fn create_repo(
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// Save tenant's config
|
||||
LayeredRepository::persist_tenant_config(conf, tenant_id, tenant_conf)?;
|
||||
Repository::persist_tenant_config(conf, tenant_id, tenant_conf)?;
|
||||
|
||||
Ok(Arc::new(LayeredRepository::new(
|
||||
Ok(Arc::new(Repository::new(
|
||||
conf,
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
@@ -104,11 +103,11 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
// - run initdb to init temporary instance and get bootstrap data
|
||||
// - after initialization complete, remove the temp dir.
|
||||
//
|
||||
fn bootstrap_timeline<R: Repository>(
|
||||
fn bootstrap_timeline(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
tli: ZTimelineId,
|
||||
repo: &R,
|
||||
repo: &Repository,
|
||||
) -> Result<()> {
|
||||
let initdb_path = conf
|
||||
.tenant_path(&tenantid)
|
||||
@@ -160,7 +159,7 @@ pub(crate) fn create_timeline(
|
||||
new_timeline_id: Option<ZTimelineId>,
|
||||
ancestor_timeline_id: Option<ZTimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
) -> Result<Option<(ZTimelineId, Arc<LayeredTimeline>)>> {
|
||||
) -> Result<Option<(ZTimelineId, Arc<Timeline>)>> {
|
||||
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ use anyhow::Result;
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::walrecord::*;
|
||||
@@ -43,15 +44,15 @@ use utils::lsn::Lsn;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub struct WalIngest<'a, T: DatadirTimeline> {
|
||||
timeline: &'a T,
|
||||
pub struct WalIngest<'a> {
|
||||
timeline: &'a Timeline,
|
||||
|
||||
checkpoint: CheckPoint,
|
||||
checkpoint_modified: bool,
|
||||
}
|
||||
|
||||
impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
pub fn new(timeline: &T, startpoint: Lsn) -> Result<WalIngest<T>> {
|
||||
impl<'a> WalIngest<'a> {
|
||||
pub fn new(timeline: &Timeline, startpoint: Lsn) -> Result<WalIngest> {
|
||||
// Fetch the latest checkpoint into memory, so that we can compare with it
|
||||
// quickly in `ingest_record` and update it when it changes.
|
||||
let checkpoint_bytes = timeline.get_checkpoint(startpoint)?;
|
||||
@@ -77,7 +78,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
&mut self,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
modification.lsn = lsn;
|
||||
@@ -266,7 +267,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn ingest_decoded_block(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
lsn: Lsn,
|
||||
decoded: &DecodedWALRecord,
|
||||
blk: &DecodedBkpBlock,
|
||||
@@ -326,7 +327,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
fn ingest_heapam_record(
|
||||
&mut self,
|
||||
buf: &mut Bytes,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
@@ -470,7 +471,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
/// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn ingest_xlog_dbase_create(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rec: &XlCreateDatabase,
|
||||
) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
@@ -537,7 +538,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn ingest_xlog_smgr_create(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rec: &XlSmgrCreate,
|
||||
) -> Result<()> {
|
||||
let rel = RelTag {
|
||||
@@ -555,7 +556,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn ingest_xlog_smgr_truncate(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rec: &XlSmgrTruncate,
|
||||
) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
@@ -620,7 +621,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
///
|
||||
fn ingest_xact_record(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
parsed: &XlXactParsedRecord,
|
||||
is_commit: bool,
|
||||
) -> Result<()> {
|
||||
@@ -689,7 +690,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn ingest_clog_truncate_record(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
xlrec: &XlClogTruncate,
|
||||
) -> Result<()> {
|
||||
info!(
|
||||
@@ -747,7 +748,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn ingest_multixact_create_record(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
) -> Result<()> {
|
||||
// Create WAL record for updating the multixact-offsets page
|
||||
@@ -826,7 +827,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn ingest_multixact_truncate_record(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
self.checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
@@ -860,7 +861,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn ingest_relmap_page(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
@@ -876,7 +877,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn put_rel_creation(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rel: RelTag,
|
||||
) -> Result<()> {
|
||||
modification.put_rel_creation(rel, 0)?;
|
||||
@@ -885,7 +886,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn put_rel_page_image(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
img: Bytes,
|
||||
@@ -897,7 +898,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn put_rel_wal_record(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
rec: ZenithWalRecord,
|
||||
@@ -909,7 +910,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn put_rel_truncation(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rel: RelTag,
|
||||
nblocks: BlockNumber,
|
||||
) -> Result<()> {
|
||||
@@ -917,11 +918,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn put_rel_drop(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
rel: RelTag,
|
||||
) -> Result<()> {
|
||||
fn put_rel_drop(&mut self, modification: &mut DatadirModification, rel: RelTag) -> Result<()> {
|
||||
modification.put_rel_drop(rel)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -937,7 +934,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn handle_rel_extend(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
rel: RelTag,
|
||||
blknum: BlockNumber,
|
||||
) -> Result<()> {
|
||||
@@ -968,7 +965,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn put_slru_page_image(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
@@ -981,7 +978,7 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
|
||||
fn handle_slru_extend(
|
||||
&mut self,
|
||||
modification: &mut DatadirModification<T>,
|
||||
modification: &mut DatadirModification,
|
||||
kind: SlruKind,
|
||||
segno: u32,
|
||||
blknum: BlockNumber,
|
||||
@@ -1032,9 +1029,9 @@ impl<'a, T: DatadirTimeline> WalIngest<'a, T> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::layered_repository::repo_harness::*;
|
||||
use crate::layered_repository::Timeline;
|
||||
use crate::pgdatadir_mapping::create_test_timeline;
|
||||
use crate::repository::repo_harness::*;
|
||||
use crate::repository::Timeline;
|
||||
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
|
||||
@@ -1046,13 +1043,13 @@ mod tests {
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
fn assert_current_logical_size<T: Timeline>(_timeline: &T, _lsn: Lsn) {
|
||||
fn assert_current_logical_size(_timeline: &Timeline, _lsn: Lsn) {
|
||||
// TODO
|
||||
}
|
||||
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
fn init_walingest_test<T: DatadirTimeline>(tline: &T) -> Result<WalIngest<T>> {
|
||||
fn init_walingest_test(tline: &Timeline) -> Result<WalIngest> {
|
||||
let mut m = tline.begin_modification(Lsn(0x10));
|
||||
m.put_checkpoint(ZERO_CHECKPOINT.clone())?;
|
||||
m.put_relmap_file(0, 111, Bytes::from(""))?; // dummy relmapper file
|
||||
@@ -1065,7 +1062,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_relsize")?.load();
|
||||
let tline = create_test_timeline(repo, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
@@ -1193,7 +1190,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_drop_extend() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_drop_extend")?.load();
|
||||
let tline = create_test_timeline(repo, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
@@ -1233,7 +1230,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_truncate_extend() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_truncate_extend")?.load();
|
||||
let tline = create_test_timeline(repo, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
// Create a 20 MB relation (the size is arbitrary)
|
||||
@@ -1321,7 +1318,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = create_test_timeline(repo, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&repo, TIMELINE_ID)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut lsn = 0x10;
|
||||
|
||||
@@ -269,7 +269,7 @@ async fn wal_receiver_main_thread_loop_step<'a>(
|
||||
}
|
||||
}
|
||||
// Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly.
|
||||
LocalTimelineUpdate::Attach { id, datadir } => {
|
||||
LocalTimelineUpdate::Attach { id, timeline } => {
|
||||
let timeline_connection_managers = local_timeline_wal_receivers
|
||||
.entry(id.tenant_id)
|
||||
.or_default();
|
||||
@@ -305,7 +305,7 @@ async fn wal_receiver_main_thread_loop_step<'a>(
|
||||
id,
|
||||
broker_prefix.to_owned(),
|
||||
etcd_client.clone(),
|
||||
datadir,
|
||||
timeline,
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
|
||||
@@ -16,7 +16,7 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::{layered_repository::LayeredTimeline, repository::Timeline};
|
||||
use crate::layered_repository::Timeline;
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use etcd_broker::{
|
||||
@@ -39,7 +39,7 @@ pub(super) fn spawn_connection_manager_task(
|
||||
id: ZTenantTimelineId,
|
||||
broker_loop_prefix: String,
|
||||
mut client: Client,
|
||||
local_timeline: Arc<LayeredTimeline>,
|
||||
local_timeline: Arc<Timeline>,
|
||||
wal_connect_timeout: Duration,
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
@@ -242,7 +242,7 @@ const WALCONNECTION_RETRY_BACKOFF_MULTIPLIER: f64 = 1.5;
|
||||
struct WalreceiverState {
|
||||
id: ZTenantTimelineId,
|
||||
/// Use pageserver data about the timeline to filter out some of the safekeepers.
|
||||
local_timeline: Arc<LayeredTimeline>,
|
||||
local_timeline: Arc<Timeline>,
|
||||
/// The timeout on the connection to safekeeper for WAL streaming.
|
||||
wal_connect_timeout: Duration,
|
||||
/// The timeout to use to determine when the current connection is "stale" and reconnect to the other one.
|
||||
@@ -300,7 +300,7 @@ struct EtcdSkTimeline {
|
||||
impl WalreceiverState {
|
||||
fn new(
|
||||
id: ZTenantTimelineId,
|
||||
local_timeline: Arc<LayeredTimeline>,
|
||||
local_timeline: Arc<Timeline>,
|
||||
wal_connect_timeout: Duration,
|
||||
lagging_wal_timeout: Duration,
|
||||
max_lsn_wal_lag: NonZeroU64,
|
||||
@@ -735,12 +735,8 @@ fn wal_stream_connection_string(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::repository::{
|
||||
repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
Repository,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::layered_repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
#[test]
|
||||
fn no_connection_no_candidate() -> anyhow::Result<()> {
|
||||
|
||||
@@ -20,11 +20,7 @@ use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
||||
|
||||
use super::TaskEvent;
|
||||
use crate::{
|
||||
layered_repository::WalReceiverInfo,
|
||||
pgdatadir_mapping::DatadirTimeline,
|
||||
repository::{Repository, Timeline},
|
||||
tenant_mgr,
|
||||
walingest::WalIngest,
|
||||
layered_repository::WalReceiverInfo, tenant_mgr, walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
};
|
||||
use postgres_ffi::v14::waldecoder::WalStreamDecoder;
|
||||
@@ -67,7 +63,7 @@ pub async fn handle_walreceiver_connection(
|
||||
)
|
||||
.await
|
||||
.context("Timed out while waiting for walreceiver connection to open")?
|
||||
.context("Failed to open walreceiver conection")?;
|
||||
.context("Failed to open walreceiver connection")?;
|
||||
|
||||
info!("connected!");
|
||||
let mut connection_status = WalConnectionStatus {
|
||||
|
||||
@@ -89,15 +89,52 @@ pub trait WalRedoManager: Send + Sync {
|
||||
// for access to the postgres process ('wait') since there is only one for
|
||||
// each tenant.
|
||||
|
||||
/// Time buckets are small because we want to be able to measure the
|
||||
/// smallest redo processing times. These buckets allow us to measure down
|
||||
/// to 5us, which equates to 200'000 pages/sec, which equates to 1.6GB/sec.
|
||||
/// This is much better than the previous 5ms aka 200 pages/sec aka 1.6MB/sec.
|
||||
macro_rules! redo_histogram_time_buckets {
|
||||
() => {
|
||||
vec![
|
||||
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
|
||||
]
|
||||
};
|
||||
}
|
||||
|
||||
/// While we're at it, also measure the amount of records replayed in each
|
||||
/// operation. We have a global 'total replayed' counter, but that's not
|
||||
/// as useful as 'what is the skew for how many records we replay in one
|
||||
/// operation'.
|
||||
macro_rules! redo_histogram_count_buckets {
|
||||
() => {
|
||||
vec![0.0, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 100.0, 250.0, 500.0]
|
||||
};
|
||||
}
|
||||
|
||||
static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!("pageserver_wal_redo_seconds", "Time spent on WAL redo")
|
||||
.expect("failed to define a metric")
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_seconds",
|
||||
"Time spent on WAL redo",
|
||||
redo_histogram_time_buckets!()
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_wait_seconds",
|
||||
"Time spent waiting for access to the WAL redo process"
|
||||
"Time spent waiting for access to the WAL redo process",
|
||||
redo_histogram_time_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||
register_histogram!(
|
||||
"pageserver_wal_redo_records_histogram",
|
||||
"Histogram of number of records replayed per redo",
|
||||
redo_histogram_count_buckets!(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
@@ -262,7 +299,10 @@ impl PostgresRedoManager {
|
||||
|
||||
let end_time = Instant::now();
|
||||
let duration = end_time.duration_since(lock_time);
|
||||
|
||||
WAL_REDO_TIME.observe(duration.as_secs_f64());
|
||||
WAL_REDO_RECORDS_HISTOGRAM.observe(records.len() as f64);
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
|
||||
26
pgxn/neon/Makefile
Normal file
26
pgxn/neon/Makefile
Normal file
@@ -0,0 +1,26 @@
|
||||
# pgxn/neon/Makefile
|
||||
|
||||
|
||||
MODULE_big = neon
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
inmem_smgr.o \
|
||||
libpagestore.o \
|
||||
libpqwalproposer.o \
|
||||
pagestore_smgr.o \
|
||||
relsize_cache.o \
|
||||
neon.o \
|
||||
walproposer.o \
|
||||
walproposer_utils.o
|
||||
|
||||
PG_CPPFLAGS = -I$(libpq_srcdir)
|
||||
SHLIB_LINK_INTERNAL = $(libpq)
|
||||
|
||||
EXTENSION = neon
|
||||
DATA = neon--1.0.sql
|
||||
PGFILEDESC = "neon - cloud storage for PostgreSQL"
|
||||
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
286
pgxn/neon/inmem_smgr.c
Normal file
286
pgxn/neon/inmem_smgr.c
Normal file
@@ -0,0 +1,286 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* inmem_smgr.c
|
||||
*
|
||||
* This is an implementation of the SMGR interface, used in the WAL redo
|
||||
* process (see src/backend/tcop/zenith_wal_redo.c). It has no persistent
|
||||
* storage, the pages that are written out are kept in a small number of
|
||||
* in-memory buffers.
|
||||
*
|
||||
* Normally, replaying a WAL record only needs to access a handful of
|
||||
* buffers, which fit in the normal buffer cache, so this is just for
|
||||
* "overflow" storage when the buffer cache is not large enough.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* pgxn/neon/inmem_smgr.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlog.h"
|
||||
#include "pagestore_client.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/relfilenode.h"
|
||||
#include "storage/smgr.h"
|
||||
|
||||
/* Size of the in-memory smgr */
|
||||
#define MAX_PAGES 64
|
||||
|
||||
/* If more than WARN_PAGES are used, print a warning in the log */
|
||||
#define WARN_PAGES 32
|
||||
|
||||
static BufferTag page_tag[MAX_PAGES];
|
||||
static char page_body[MAX_PAGES][BLCKSZ];
|
||||
static int used_pages;
|
||||
|
||||
static int
|
||||
locate_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno)
|
||||
{
|
||||
/* We only hold a small number of pages, so linear search */
|
||||
for (int i = 0; i < used_pages; i++)
|
||||
{
|
||||
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
|
||||
&& forknum == page_tag[i].forkNum
|
||||
&& blkno == page_tag[i].blockNum)
|
||||
{
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_init() -- Initialize private state
|
||||
*/
|
||||
void
|
||||
inmem_init(void)
|
||||
{
|
||||
used_pages = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_exists() -- Does the physical file exist?
|
||||
*/
|
||||
bool
|
||||
inmem_exists(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
for (int i = 0; i < used_pages; i++)
|
||||
{
|
||||
if (RelFileNodeEquals(reln->smgr_rnode.node, page_tag[i].rnode)
|
||||
&& forknum == page_tag[i].forkNum)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_create() -- Create a new relation on zenithd storage
|
||||
*
|
||||
* If isRedo is true, it's okay for the relation to exist already.
|
||||
*/
|
||||
void
|
||||
inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_unlink() -- Unlink a relation.
|
||||
*/
|
||||
void
|
||||
inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_extend() -- Add a block to the specified relation.
|
||||
*
|
||||
* The semantics are nearly the same as mdwrite(): write at the
|
||||
* specified position. However, this is to be used for the case of
|
||||
* extending a relation (i.e., blocknum is at or beyond the current
|
||||
* EOF). Note that we assume writing a block beyond current EOF
|
||||
* causes intervening file space to become filled with zeroes.
|
||||
*/
|
||||
void
|
||||
inmem_extend(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
/* same as smgwrite() for us */
|
||||
inmem_write(reln, forknum, blkno, buffer, skipFsync);
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_open() -- Initialize newly-opened relation.
|
||||
*/
|
||||
void
|
||||
inmem_open(SMgrRelation reln)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_close() -- Close the specified relation, if it isn't closed already.
|
||||
*/
|
||||
void
|
||||
inmem_close(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_prefetch() -- Initiate asynchronous read of the specified block of a relation
|
||||
*/
|
||||
bool
|
||||
inmem_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_writeback() -- Tell the kernel to write pages back to storage.
|
||||
*/
|
||||
void
|
||||
inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_read() -- Read the specified block from a relation.
|
||||
*/
|
||||
void
|
||||
inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
|
||||
char *buffer)
|
||||
{
|
||||
int pg;
|
||||
|
||||
pg = locate_page(reln, forknum, blkno);
|
||||
if (pg < 0)
|
||||
memset(buffer, 0, BLCKSZ);
|
||||
else
|
||||
memcpy(buffer, page_body[pg], BLCKSZ);
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_write() -- Write the supplied block at the appropriate location.
|
||||
*
|
||||
* This is to be used only for updating already-existing blocks of a
|
||||
* relation (ie, those before the current EOF). To extend a relation,
|
||||
* use mdextend().
|
||||
*/
|
||||
void
|
||||
inmem_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer, bool skipFsync)
|
||||
{
|
||||
int pg;
|
||||
|
||||
pg = locate_page(reln, forknum, blocknum);
|
||||
if (pg < 0)
|
||||
{
|
||||
/*
|
||||
* We assume the buffer cache is large enough to hold all the buffers
|
||||
* needed for most operations. Overflowing to this "in-mem smgr" in rare
|
||||
* cases is OK. But if we find that we're using more than WARN_PAGES,
|
||||
* print a warning so that we get alerted and get to investigate why
|
||||
* we're accessing so many buffers.
|
||||
*/
|
||||
elog(used_pages >= WARN_PAGES ? WARNING : DEBUG1,
|
||||
"inmem_write() called for %u/%u/%u.%u blk %u: used_pages %u",
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum,
|
||||
blocknum,
|
||||
used_pages);
|
||||
if (used_pages == MAX_PAGES)
|
||||
elog(ERROR, "Inmem storage overflow");
|
||||
|
||||
pg = used_pages;
|
||||
used_pages++;
|
||||
INIT_BUFFERTAG(page_tag[pg], reln->smgr_rnode.node, forknum, blocknum);
|
||||
} else {
|
||||
elog(DEBUG1, "inmem_write() called for %u/%u/%u.%u blk %u: found at %u",
|
||||
reln->smgr_rnode.node.spcNode,
|
||||
reln->smgr_rnode.node.dbNode,
|
||||
reln->smgr_rnode.node.relNode,
|
||||
forknum,
|
||||
blocknum,
|
||||
used_pages);
|
||||
}
|
||||
memcpy(page_body[pg], buffer, BLCKSZ);
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_nblocks() -- Get the number of blocks stored in a relation.
|
||||
*/
|
||||
BlockNumber
|
||||
inmem_nblocks(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
/*
|
||||
* It's not clear why a WAL redo function would call smgrnblocks().
|
||||
* During recovery, at least before reaching consistency, the size of a
|
||||
* relation could be arbitrarily small, if it was truncated after the
|
||||
* record being replayed, or arbitrarily large if it was extended
|
||||
* afterwards. But one place where it's called is in
|
||||
* XLogReadBufferExtended(): it extends the relation, if it's smaller than
|
||||
* the requested page. That's a waste of time in the WAL redo
|
||||
* process. Pretend that all relations are maximally sized to avoid it.
|
||||
*/
|
||||
return MaxBlockNumber;
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_truncate() -- Truncate relation to specified number of blocks.
|
||||
*/
|
||||
void
|
||||
inmem_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
{
|
||||
}
|
||||
|
||||
/*
|
||||
* inmem_immedsync() -- Immediately sync a relation to stable storage.
|
||||
*/
|
||||
void
|
||||
inmem_immedsync(SMgrRelation reln, ForkNumber forknum)
|
||||
{
|
||||
}
|
||||
|
||||
static const struct f_smgr inmem_smgr =
|
||||
{
|
||||
.smgr_init = inmem_init,
|
||||
.smgr_shutdown = NULL,
|
||||
.smgr_open = inmem_open,
|
||||
.smgr_close = inmem_close,
|
||||
.smgr_create = inmem_create,
|
||||
.smgr_exists = inmem_exists,
|
||||
.smgr_unlink = inmem_unlink,
|
||||
.smgr_extend = inmem_extend,
|
||||
.smgr_prefetch = inmem_prefetch,
|
||||
.smgr_read = inmem_read,
|
||||
.smgr_write = inmem_write,
|
||||
.smgr_writeback = inmem_writeback,
|
||||
.smgr_nblocks = inmem_nblocks,
|
||||
.smgr_truncate = inmem_truncate,
|
||||
.smgr_immedsync = inmem_immedsync,
|
||||
};
|
||||
|
||||
const f_smgr *
|
||||
smgr_inmem(BackendId backend, RelFileNode rnode)
|
||||
{
|
||||
Assert(InRecovery);
|
||||
if (backend != InvalidBackendId)
|
||||
return smgr_standard(backend, rnode);
|
||||
else
|
||||
return &inmem_smgr;
|
||||
}
|
||||
|
||||
/*
 *	smgr_init_inmem() -- Initialization entry point for the in-memory smgr.
 *
 *		Simply forwards to inmem_init(), which resets the page store.
 */
void
smgr_init_inmem()
{
	inmem_init();
}
|
||||
432
pgxn/neon/libpagestore.c
Normal file
432
pgxn/neon/libpagestore.c
Normal file
@@ -0,0 +1,432 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* libpagestore.c
|
||||
* Handles network communications with the remote pagestore.
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* pgxn/neon/libpagestore.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "pagestore_client.h"
|
||||
#include "fmgr.h"
|
||||
#include "access/xlog.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "libpq/libpq.h"
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "pgstat.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
#include "walproposer_utils.h"
|
||||
|
||||
|
||||
#define PageStoreTrace DEBUG5
|
||||
|
||||
#define NEON_TAG "[NEON_SMGR] "
|
||||
#define neon_log(tag, fmt, ...) ereport(tag, \
|
||||
(errmsg(NEON_TAG fmt, ## __VA_ARGS__), \
|
||||
errhidestmt(true), errhidecontext(true)))
|
||||
|
||||
bool connected = false;
|
||||
PGconn *pageserver_conn = NULL;
|
||||
|
||||
char *page_server_connstring_raw;
|
||||
|
||||
static ZenithResponse *pageserver_call(ZenithRequest *request);
|
||||
page_server_api api = {
|
||||
.request = pageserver_call
|
||||
};
|
||||
|
||||
static void
|
||||
pageserver_connect()
|
||||
{
|
||||
char *query;
|
||||
int ret;
|
||||
|
||||
Assert(!connected);
|
||||
|
||||
pageserver_conn = PQconnectdb(page_server_connstring);
|
||||
|
||||
if (PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
|
||||
errmsg(NEON_TAG "could not establish connection to pageserver"),
|
||||
errdetail_internal("%s", msg)));
|
||||
}
|
||||
|
||||
query = psprintf("pagestream %s %s", zenith_tenant, zenith_timeline);
|
||||
ret = PQsendQuery(pageserver_conn, query);
|
||||
if (ret != 1)
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
neon_log(ERROR, "could not send pagestream command to pageserver");
|
||||
}
|
||||
|
||||
while (PQisBusy(pageserver_conn))
|
||||
{
|
||||
int wc;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
wc = WaitLatchOrSocket(MyLatch,
|
||||
WL_LATCH_SET | WL_SOCKET_READABLE |
|
||||
WL_EXIT_ON_PM_DEATH,
|
||||
PQsocket(pageserver_conn),
|
||||
-1L, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (wc & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(pageserver_conn))
|
||||
{
|
||||
char *msg = pchomp(PQerrorMessage(pageserver_conn));
|
||||
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
|
||||
neon_log(ERROR, "could not complete handshake with pageserver: %s",
|
||||
msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring_raw);
|
||||
|
||||
connected = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* A wrapper around PQgetCopyData that checks for interrupts while sleeping.
|
||||
*/
|
||||
static int
|
||||
call_PQgetCopyData(PGconn *conn, char **buffer)
|
||||
{
|
||||
int ret;
|
||||
|
||||
retry:
|
||||
ret = PQgetCopyData(conn, buffer, 1 /* async */ );
|
||||
|
||||
if (ret == 0)
|
||||
{
|
||||
int wc;
|
||||
|
||||
/* Sleep until there's something to do */
|
||||
wc = WaitLatchOrSocket(MyLatch,
|
||||
WL_LATCH_SET | WL_SOCKET_READABLE |
|
||||
WL_EXIT_ON_PM_DEATH,
|
||||
PQsocket(conn),
|
||||
-1L, PG_WAIT_EXTENSION);
|
||||
ResetLatch(MyLatch);
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/* Data available in socket? */
|
||||
if (wc & WL_SOCKET_READABLE)
|
||||
{
|
||||
if (!PQconsumeInput(conn))
|
||||
neon_log(ERROR, "could not get response from pageserver: %s",
|
||||
PQerrorMessage(conn));
|
||||
}
|
||||
|
||||
goto retry;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
static ZenithResponse *
|
||||
pageserver_call(ZenithRequest *request)
|
||||
{
|
||||
StringInfoData req_buff;
|
||||
StringInfoData resp_buff;
|
||||
ZenithResponse *resp;
|
||||
|
||||
PG_TRY();
|
||||
{
|
||||
/* If the connection was lost for some reason, reconnect */
|
||||
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
|
||||
{
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
}
|
||||
|
||||
if (!connected)
|
||||
pageserver_connect();
|
||||
|
||||
req_buff = zm_pack_request(request);
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
*
|
||||
* In principle, this could block if the output buffer is full, and we
|
||||
* should use async mode and check for interrupts while waiting. In
|
||||
* practice, our requests are small enough to always fit in the output
|
||||
* and TCP buffer.
|
||||
*/
|
||||
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0 || PQflush(pageserver_conn))
|
||||
{
|
||||
neon_log(ERROR, "failed to send page request: %s",
|
||||
PQerrorMessage(pageserver_conn));
|
||||
}
|
||||
pfree(req_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = zm_to_string((ZenithMessage *) request);
|
||||
|
||||
neon_log(PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
|
||||
/* read response */
|
||||
resp_buff.len = call_PQgetCopyData(pageserver_conn, &resp_buff.data);
|
||||
resp_buff.cursor = 0;
|
||||
|
||||
if (resp_buff.len == -1)
|
||||
neon_log(ERROR, "end of COPY");
|
||||
else if (resp_buff.len == -2)
|
||||
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
|
||||
|
||||
resp = zm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = zm_to_string((ZenithMessage *) resp);
|
||||
|
||||
neon_log(PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
}
|
||||
}
|
||||
PG_CATCH();
|
||||
{
|
||||
/*
|
||||
* If anything goes wrong while we were sending a request, it's not
|
||||
* clear what state the connection is in. For example, if we sent the
|
||||
* request but didn't receive a response yet, we might receive the
|
||||
* response some time later after we have already sent a new unrelated
|
||||
* request. Close the connection to avoid getting confused.
|
||||
*/
|
||||
if (connected)
|
||||
{
|
||||
neon_log(LOG, "dropping connection to page server due to error");
|
||||
PQfinish(pageserver_conn);
|
||||
pageserver_conn = NULL;
|
||||
connected = false;
|
||||
}
|
||||
PG_RE_THROW();
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
return (ZenithResponse *) resp;
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
check_zenith_id(char **newval, void **extra, GucSource source)
|
||||
{
|
||||
uint8 zid[16];
|
||||
|
||||
return **newval == '\0' || HexDecodeString(zid, *newval, 16);
|
||||
}
|
||||
|
||||
static char *
|
||||
substitute_pageserver_password(const char *page_server_connstring_raw)
|
||||
{
|
||||
char *host = NULL;
|
||||
char *port = NULL;
|
||||
char *user = NULL;
|
||||
char *auth_token = NULL;
|
||||
char *err = NULL;
|
||||
char *page_server_connstring = NULL;
|
||||
PQconninfoOption *conn_options;
|
||||
PQconninfoOption *conn_option;
|
||||
MemoryContext oldcontext;
|
||||
|
||||
/*
|
||||
* Here we substitute password in connection string with an environment
|
||||
* variable. To simplify things we construct a connection string back with
|
||||
* only known options. In particular: host port user and password. We do
|
||||
* not currently use other options and constructing full connstring in an
|
||||
* URI shape is quite messy.
|
||||
*/
|
||||
|
||||
if (page_server_connstring_raw == NULL || page_server_connstring_raw[0] == '\0')
|
||||
return NULL;
|
||||
|
||||
/* extract the auth token from the connection string */
|
||||
conn_options = PQconninfoParse(page_server_connstring_raw, &err);
|
||||
if (conn_options == NULL)
|
||||
{
|
||||
/* The error string is malloc'd, so we must free it explicitly */
|
||||
char *errcopy = err ? pstrdup(err) : "out of memory";
|
||||
|
||||
PQfreemem(err);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid connection string syntax: %s", errcopy)));
|
||||
}
|
||||
|
||||
/*
|
||||
* Trying to populate pageserver connection string with auth token from
|
||||
* environment. We are looking for password in with placeholder value like
|
||||
* $ENV_VAR_NAME, so if password field is present and starts with $ we try
|
||||
* to fetch environment variable value and fail loudly if it is not set.
|
||||
*/
|
||||
for (conn_option = conn_options; conn_option->keyword != NULL; conn_option++)
|
||||
{
|
||||
if (strcmp(conn_option->keyword, "host") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
host = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "port") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
port = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "user") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
user = conn_option->val;
|
||||
}
|
||||
else if (strcmp(conn_option->keyword, "password") == 0)
|
||||
{
|
||||
if (conn_option->val != NULL && conn_option->val[0] != '\0')
|
||||
{
|
||||
/* ensure that this is a template */
|
||||
if (strncmp(conn_option->val, "$", 1) != 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("expected placeholder value in pageserver password starting from $ but found: %s", &conn_option->val[1])));
|
||||
|
||||
neon_log(LOG, "found auth token placeholder in pageserver conn string '%s'", &conn_option->val[1]);
|
||||
auth_token = getenv(&conn_option->val[1]);
|
||||
if (!auth_token)
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
||||
errmsg("cannot get auth token, environment variable %s is not set", &conn_option->val[1])));
|
||||
}
|
||||
else
|
||||
{
|
||||
neon_log(LOG, "using auth token from environment passed via env");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* allocate connection string in TopMemoryContext to make sure it is not
|
||||
* freed
|
||||
*/
|
||||
oldcontext = CurrentMemoryContext;
|
||||
MemoryContextSwitchTo(TopMemoryContext);
|
||||
page_server_connstring = psprintf("postgresql://%s:%s@%s:%s", user, auth_token ? auth_token : "", host, port);
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
|
||||
PQconninfoFree(conn_options);
|
||||
return page_server_connstring;
|
||||
}
|
||||
|
||||
/*
|
||||
* Module initialization function
|
||||
*/
|
||||
void
|
||||
pg_init_libpagestore(void)
|
||||
{
|
||||
DefineCustomStringVariable("neon.pageserver_connstring",
|
||||
"connection string to the page server",
|
||||
NULL,
|
||||
&page_server_connstring_raw,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.timeline_id",
|
||||
"Zenith timelineid the server is running on",
|
||||
NULL,
|
||||
&zenith_timeline,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
check_zenith_id, NULL, NULL);
|
||||
|
||||
DefineCustomStringVariable("neon.tenant_id",
|
||||
"Neon tenantid the server is running on",
|
||||
NULL,
|
||||
&zenith_tenant,
|
||||
"",
|
||||
PGC_POSTMASTER,
|
||||
0, /* no flags required */
|
||||
check_zenith_id, NULL, NULL);
|
||||
|
||||
DefineCustomBoolVariable("neon.wal_redo",
|
||||
"start in wal-redo mode",
|
||||
NULL,
|
||||
&wal_redo,
|
||||
false,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
DefineCustomIntVariable("neon.max_cluster_size",
|
||||
"cluster size limit",
|
||||
NULL,
|
||||
&max_cluster_size,
|
||||
-1, -1, INT_MAX,
|
||||
PGC_SIGHUP,
|
||||
GUC_UNIT_MB,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
relsize_hash_init();
|
||||
|
||||
if (page_server != NULL)
|
||||
neon_log(ERROR, "libpagestore already loaded");
|
||||
|
||||
neon_log(PageStoreTrace, "libpagestore already loaded");
|
||||
page_server = &api;
|
||||
|
||||
/* substitute password in pageserver_connstring */
|
||||
page_server_connstring = substitute_pageserver_password(page_server_connstring_raw);
|
||||
|
||||
/* Is there more correct way to pass CustomGUC to postgres code? */
|
||||
zenith_timeline_walproposer = zenith_timeline;
|
||||
zenith_tenant_walproposer = zenith_tenant;
|
||||
|
||||
if (wal_redo)
|
||||
{
|
||||
neon_log(PageStoreTrace, "set inmem_smgr hook");
|
||||
smgr_hook = smgr_inmem;
|
||||
smgr_init_hook = smgr_init_inmem;
|
||||
}
|
||||
else if (page_server_connstring && page_server_connstring[0])
|
||||
{
|
||||
neon_log(PageStoreTrace, "set neon_smgr hook");
|
||||
smgr_hook = smgr_zenith;
|
||||
smgr_init_hook = smgr_init_zenith;
|
||||
dbsize_hook = zenith_dbsize;
|
||||
}
|
||||
}
|
||||
413
pgxn/neon/libpqwalproposer.c
Normal file
413
pgxn/neon/libpqwalproposer.c
Normal file
@@ -0,0 +1,413 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "libpq-fe.h"
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
|
||||
struct WalProposerConn
|
||||
{
|
||||
PGconn* pg_conn;
|
||||
bool is_nonblocking; /* whether the connection is non-blocking */
|
||||
char *recvbuf; /* last received data from libpqprop_async_read */
|
||||
};
|
||||
|
||||
/* Prototypes for exported functions */
|
||||
static char* libpqprop_error_message(WalProposerConn* conn);
|
||||
static WalProposerConnStatusType libpqprop_status(WalProposerConn* conn);
|
||||
static WalProposerConn* libpqprop_connect_start(char* conninfo);
|
||||
static WalProposerConnectPollStatusType libpqprop_connect_poll(WalProposerConn* conn);
|
||||
static bool libpqprop_send_query(WalProposerConn* conn, char* query);
|
||||
static WalProposerExecStatusType libpqprop_get_query_result(WalProposerConn* conn);
|
||||
static pgsocket libpqprop_socket(WalProposerConn* conn);
|
||||
static int libpqprop_flush(WalProposerConn* conn);
|
||||
static void libpqprop_finish(WalProposerConn* conn);
|
||||
static PGAsyncReadResult libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount);
|
||||
static PGAsyncWriteResult libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size);
|
||||
static bool libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size);
|
||||
|
||||
static WalProposerFunctionsType PQWalProposerFunctions = {
|
||||
libpqprop_error_message,
|
||||
libpqprop_status,
|
||||
libpqprop_connect_start,
|
||||
libpqprop_connect_poll,
|
||||
libpqprop_send_query,
|
||||
libpqprop_get_query_result,
|
||||
libpqprop_socket,
|
||||
libpqprop_flush,
|
||||
libpqprop_finish,
|
||||
libpqprop_async_read,
|
||||
libpqprop_async_write,
|
||||
libpqprop_blocking_write,
|
||||
};
|
||||
|
||||
/* Module initialization */
|
||||
void
|
||||
pg_init_libpqwalproposer(void)
|
||||
{
|
||||
if (WalProposerFunctions != NULL)
|
||||
elog(ERROR, "libpqwalproposer already loaded");
|
||||
WalProposerFunctions = &PQWalProposerFunctions;
|
||||
}
|
||||
|
||||
/* Helper function */
|
||||
static bool
|
||||
ensure_nonblocking_status(WalProposerConn* conn, bool is_nonblocking)
|
||||
{
|
||||
/* If we're already correctly blocking or nonblocking, all good */
|
||||
if (is_nonblocking == conn->is_nonblocking)
|
||||
return true;
|
||||
|
||||
/* Otherwise, set it appropriately */
|
||||
if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
|
||||
return false;
|
||||
|
||||
conn->is_nonblocking = is_nonblocking;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Exported function definitions */
|
||||
static char*
|
||||
libpqprop_error_message(WalProposerConn* conn)
|
||||
{
|
||||
return PQerrorMessage(conn->pg_conn);
|
||||
}
|
||||
|
||||
static WalProposerConnStatusType
|
||||
libpqprop_status(WalProposerConn* conn)
|
||||
{
|
||||
switch (PQstatus(conn->pg_conn))
|
||||
{
|
||||
case CONNECTION_OK:
|
||||
return WP_CONNECTION_OK;
|
||||
case CONNECTION_BAD:
|
||||
return WP_CONNECTION_BAD;
|
||||
default:
|
||||
return WP_CONNECTION_IN_PROGRESS;
|
||||
}
|
||||
}
|
||||
|
||||
static WalProposerConn*
|
||||
libpqprop_connect_start(char* conninfo)
|
||||
{
|
||||
WalProposerConn* conn;
|
||||
PGconn* pg_conn;
|
||||
|
||||
pg_conn = PQconnectStart(conninfo);
|
||||
/*
|
||||
* Allocation of a PQconn can fail, and will return NULL. We want to fully replicate the
|
||||
* behavior of PQconnectStart here.
|
||||
*/
|
||||
if (!pg_conn)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* And in theory this allocation can fail as well, but it's incredibly unlikely if we just
|
||||
* successfully allocated a PGconn.
|
||||
*
|
||||
* palloc will exit on failure though, so there's not much we could do if it *did* fail.
|
||||
*/
|
||||
conn = palloc(sizeof(WalProposerConn));
|
||||
conn->pg_conn = pg_conn;
|
||||
conn->is_nonblocking = false; /* connections always start in blocking mode */
|
||||
conn->recvbuf = NULL;
|
||||
return conn;
|
||||
}
|
||||
|
||||
static WalProposerConnectPollStatusType
|
||||
libpqprop_connect_poll(WalProposerConn* conn)
|
||||
{
|
||||
WalProposerConnectPollStatusType return_val;
|
||||
|
||||
switch (PQconnectPoll(conn->pg_conn))
|
||||
{
|
||||
case PGRES_POLLING_FAILED:
|
||||
return_val = WP_CONN_POLLING_FAILED;
|
||||
break;
|
||||
case PGRES_POLLING_READING:
|
||||
return_val = WP_CONN_POLLING_READING;
|
||||
break;
|
||||
case PGRES_POLLING_WRITING:
|
||||
return_val = WP_CONN_POLLING_WRITING;
|
||||
break;
|
||||
case PGRES_POLLING_OK:
|
||||
return_val = WP_CONN_POLLING_OK;
|
||||
break;
|
||||
|
||||
/* There's a comment at its source about this constant being unused. We'll expect it's never
|
||||
* returned. */
|
||||
case PGRES_POLLING_ACTIVE:
|
||||
elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
|
||||
/* This return is never actually reached, but it's here to make the compiler happy */
|
||||
return WP_CONN_POLLING_FAILED;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
return_val = WP_CONN_POLLING_FAILED; /* keep the compiler quiet */
|
||||
}
|
||||
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static bool
|
||||
libpqprop_send_query(WalProposerConn* conn, char* query)
|
||||
{
|
||||
/* We need to be in blocking mode for sending the query to run without
|
||||
* requiring a call to PQflush */
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
/* PQsendQuery returns 1 on success, 0 on failure */
|
||||
if (!PQsendQuery(conn->pg_conn, query))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static WalProposerExecStatusType
|
||||
libpqprop_get_query_result(WalProposerConn* conn)
|
||||
{
|
||||
PGresult* result;
|
||||
WalProposerExecStatusType return_val;
|
||||
|
||||
/* Marker variable if we need to log an unexpected success result */
|
||||
char* unexpected_success = NULL;
|
||||
|
||||
/* Consume any input that we might be missing */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
return WP_EXEC_FAILED;
|
||||
|
||||
if (PQisBusy(conn->pg_conn))
|
||||
return WP_EXEC_NEEDS_INPUT;
|
||||
|
||||
|
||||
result = PQgetResult(conn->pg_conn);
|
||||
/* PQgetResult returns NULL only if getting the result was successful & there's no more of the
|
||||
* result to get. */
|
||||
if (!result)
|
||||
{
|
||||
elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
|
||||
return WP_EXEC_UNEXPECTED_SUCCESS;
|
||||
}
|
||||
|
||||
/* Helper macro to reduce boilerplate */
|
||||
#define UNEXPECTED_SUCCESS(msg) \
|
||||
return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
|
||||
unexpected_success = msg; \
|
||||
break;
|
||||
|
||||
|
||||
switch (PQresultStatus(result))
|
||||
{
|
||||
/* "true" success case */
|
||||
case PGRES_COPY_BOTH:
|
||||
return_val = WP_EXEC_SUCCESS_COPYBOTH;
|
||||
break;
|
||||
|
||||
/* Unexpected success case */
|
||||
case PGRES_EMPTY_QUERY:
|
||||
UNEXPECTED_SUCCESS("empty query return");
|
||||
case PGRES_COMMAND_OK:
|
||||
UNEXPECTED_SUCCESS("data-less command end");
|
||||
case PGRES_TUPLES_OK:
|
||||
UNEXPECTED_SUCCESS("tuples return");
|
||||
case PGRES_COPY_OUT:
|
||||
UNEXPECTED_SUCCESS("'Copy Out' response");
|
||||
case PGRES_COPY_IN:
|
||||
UNEXPECTED_SUCCESS("'Copy In' response");
|
||||
case PGRES_SINGLE_TUPLE:
|
||||
UNEXPECTED_SUCCESS("single tuple return");
|
||||
case PGRES_PIPELINE_SYNC:
|
||||
UNEXPECTED_SUCCESS("pipeline sync point");
|
||||
|
||||
/* Failure cases */
|
||||
case PGRES_BAD_RESPONSE:
|
||||
case PGRES_NONFATAL_ERROR:
|
||||
case PGRES_FATAL_ERROR:
|
||||
case PGRES_PIPELINE_ABORTED:
|
||||
return_val = WP_EXEC_FAILED;
|
||||
break;
|
||||
|
||||
default:
|
||||
Assert(false);
|
||||
return_val = WP_EXEC_FAILED; /* keep the compiler quiet */
|
||||
}
|
||||
|
||||
if (unexpected_success)
|
||||
elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
|
||||
|
||||
return return_val;
|
||||
}
|
||||
|
||||
static pgsocket
|
||||
libpqprop_socket(WalProposerConn* conn)
|
||||
{
|
||||
return PQsocket(conn->pg_conn);
|
||||
}
|
||||
|
||||
static int
|
||||
libpqprop_flush(WalProposerConn* conn)
|
||||
{
|
||||
return (PQflush(conn->pg_conn));
|
||||
}
|
||||
|
||||
static void
|
||||
libpqprop_finish(WalProposerConn* conn)
|
||||
{
|
||||
if (conn->recvbuf != NULL)
|
||||
PQfreemem(conn->recvbuf);
|
||||
PQfinish(conn->pg_conn);
|
||||
pfree(conn);
|
||||
}
|
||||
|
||||
/*
|
||||
* Receive a message from the safekeeper.
|
||||
*
|
||||
* On success, the data is placed in *buf. It is valid until the next call
|
||||
* to this function.
|
||||
*/
|
||||
static PGAsyncReadResult
|
||||
libpqprop_async_read(WalProposerConn* conn, char** buf, int* amount)
|
||||
{
|
||||
int result;
|
||||
|
||||
if (conn->recvbuf != NULL)
|
||||
{
|
||||
PQfreemem(conn->recvbuf);
|
||||
conn->recvbuf = NULL;
|
||||
}
|
||||
|
||||
/* Call PQconsumeInput so that we have the data we need */
|
||||
if (!PQconsumeInput(conn->pg_conn))
|
||||
{
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
}
|
||||
|
||||
/* The docs for PQgetCopyData list the return values as:
|
||||
* 0 if the copy is still in progress, but no "complete row" is
|
||||
* available
|
||||
* -1 if the copy is done
|
||||
* -2 if an error occured
|
||||
* (> 0) if it was successful; that value is the amount transferred.
|
||||
*
|
||||
* The protocol we use between walproposer and safekeeper means that we
|
||||
* *usually* wouldn't expect to see that the copy is done, but this can
|
||||
* sometimes be triggered by the server returning an ErrorResponse (which
|
||||
* also happens to have the effect that the copy is done).
|
||||
*/
|
||||
switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
|
||||
{
|
||||
case 0:
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_TRY_AGAIN;
|
||||
case -1:
|
||||
{
|
||||
/*
|
||||
* If we get -1, it's probably because of a server error; the
|
||||
* safekeeper won't normally send a CopyDone message.
|
||||
*
|
||||
* We can check PQgetResult to make sure that the server failed;
|
||||
* it'll always result in PGRES_FATAL_ERROR
|
||||
*/
|
||||
ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
|
||||
|
||||
if (status != PGRES_FATAL_ERROR)
|
||||
elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
|
||||
|
||||
/* If there was actually an error, it'll be properly reported by
|
||||
* calls to PQerrorMessage -- we don't have to do anything else */
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
}
|
||||
case -2:
|
||||
*amount = 0;
|
||||
*buf = NULL;
|
||||
return PG_ASYNC_READ_FAIL;
|
||||
default:
|
||||
/* Positive values indicate the size of the returned result */
|
||||
*amount = result;
|
||||
*buf = conn->recvbuf;
|
||||
return PG_ASYNC_READ_SUCCESS;
|
||||
}
|
||||
}
|
||||
|
||||
static PGAsyncWriteResult
|
||||
libpqprop_async_write(WalProposerConn* conn, void const* buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we aren't in non-blocking mode, switch to it. */
|
||||
if (!ensure_nonblocking_status(conn, true))
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
|
||||
/* The docs for PQputcopyData list the return values as:
|
||||
* 1 if the data was queued,
|
||||
* 0 if it was not queued because of full buffers, or
|
||||
* -1 if an error occured
|
||||
*/
|
||||
result = PQputCopyData(conn->pg_conn, buf, size);
|
||||
|
||||
/* We won't get a result of zero because walproposer always empties the
|
||||
* connection's buffers before sending more */
|
||||
Assert(result != 0);
|
||||
|
||||
switch (result)
|
||||
{
|
||||
case 1:
|
||||
/* good -- continue */
|
||||
break;
|
||||
case -1:
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
default:
|
||||
elog(FATAL, "invalid return %d from PQputCopyData", result);
|
||||
}
|
||||
|
||||
/* After queueing the data, we still need to flush to get it to send.
|
||||
* This might take multiple tries, but we don't want to wait around
|
||||
* until it's done.
|
||||
*
|
||||
* PQflush has the following returns (directly quoting the docs):
|
||||
* 0 if sucessful,
|
||||
* 1 if it was unable to send all the data in the send queue yet
|
||||
* -1 if it failed for some reason
|
||||
*/
|
||||
switch (result = PQflush(conn->pg_conn)) {
|
||||
case 0:
|
||||
return PG_ASYNC_WRITE_SUCCESS;
|
||||
case 1:
|
||||
return PG_ASYNC_WRITE_TRY_FLUSH;
|
||||
case -1:
|
||||
return PG_ASYNC_WRITE_FAIL;
|
||||
default:
|
||||
elog(FATAL, "invalid return %d from PQflush", result);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
libpqprop_blocking_write(WalProposerConn* conn, void const* buf, size_t size)
|
||||
{
|
||||
int result;
|
||||
|
||||
/* If we are in non-blocking mode, switch out of it. */
|
||||
if (!ensure_nonblocking_status(conn, false))
|
||||
return false;
|
||||
|
||||
/* Ths function is very similar to libpqprop_async_write. For more
|
||||
* information, refer to the comments there */
|
||||
if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 1);
|
||||
|
||||
/* Because the connection is non-blocking, flushing returns 0 or -1 */
|
||||
|
||||
if ((result = PQflush(conn->pg_conn)) == -1)
|
||||
return false;
|
||||
|
||||
Assert(result == 0);
|
||||
return true;
|
||||
}
|
||||
17
pgxn/neon/neon--1.0.sql
Normal file
17
pgxn/neon/neon--1.0.sql
Normal file
@@ -0,0 +1,17 @@
|
||||
\echo Use "CREATE EXTENSION neon" to load this file. \quit
|
||||
|
||||
-- SQL-callable wrapper for the C function pg_cluster_size in this module.
CREATE FUNCTION pg_cluster_size() RETURNS bigint
    LANGUAGE C
    STRICT
    PARALLEL UNSAFE
    AS 'MODULE_PATHNAME', 'pg_cluster_size';
|
||||
|
||||
-- SQL-callable wrapper for the C function backpressure_lsns in this module.
-- The three OUT columns make up the returned record.
CREATE FUNCTION backpressure_lsns(
    OUT received_lsn          pg_lsn,
    OUT disk_consistent_lsn   pg_lsn,
    OUT remote_consistent_lsn pg_lsn
) RETURNS record
    LANGUAGE C
    STRICT
    PARALLEL UNSAFE
    AS 'MODULE_PATHNAME', 'backpressure_lsns';
|
||||
82
pgxn/neon/neon.c
Normal file
82
pgxn/neon/neon.c
Normal file
@@ -0,0 +1,82 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neon.c
|
||||
* Utility functions to expose neon specific information to user
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* pgxn/neon/neon.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "replication/walsender.h"
|
||||
#include "funcapi.h"
|
||||
#include "access/htup_details.h"
|
||||
#include "utils/pg_lsn.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
#include "neon.h"
|
||||
#include "walproposer.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
void _PG_init(void);
|
||||
|
||||
|
||||
/*
 * Extension load hook.
 *
 * Runs the three sub-module initializers in a fixed order (page store
 * client, libpq walproposer backend, walproposer) and then asks the GUC
 * machinery to warn about unrecognized "neon." placeholder settings.
 */
void
_PG_init(void)
{
    /* sub-module setup */
    pg_init_libpagestore();
    pg_init_libpqwalproposer();
    pg_init_walproposer();

    /* all neon.* GUCs are defined by now */
    EmitWarningsOnPlaceholders("neon");
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(pg_cluster_size);
|
||||
PG_FUNCTION_INFO_V1(backpressure_lsns);
|
||||
|
||||
Datum
|
||||
pg_cluster_size(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int64 size;
|
||||
|
||||
size = GetZenithCurrentClusterSize();
|
||||
|
||||
if (size == 0)
|
||||
PG_RETURN_NULL();
|
||||
|
||||
PG_RETURN_INT64(size);
|
||||
}
|
||||
|
||||
|
||||
Datum
|
||||
backpressure_lsns(PG_FUNCTION_ARGS)
|
||||
{
|
||||
XLogRecPtr writePtr;
|
||||
XLogRecPtr flushPtr;
|
||||
XLogRecPtr applyPtr;
|
||||
Datum values[3];
|
||||
bool nulls[3];
|
||||
TupleDesc tupdesc;
|
||||
|
||||
replication_feedback_get_lsns(&writePtr, &flushPtr, &applyPtr);
|
||||
|
||||
tupdesc = CreateTemplateTupleDesc(3);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "received_lsn", PG_LSNOID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "disk_consistent_lsn", PG_LSNOID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "remote_consistent_lsn", PG_LSNOID, -1, 0);
|
||||
tupdesc = BlessTupleDesc(tupdesc);
|
||||
|
||||
MemSet(nulls, 0, sizeof(nulls));
|
||||
values[0] = LSNGetDatum(writePtr);
|
||||
values[1] = LSNGetDatum(flushPtr);
|
||||
values[2] = LSNGetDatum(applyPtr);
|
||||
|
||||
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
|
||||
}
|
||||
4
pgxn/neon/neon.control
Normal file
4
pgxn/neon/neon.control
Normal file
@@ -0,0 +1,4 @@
|
||||
# neon extension
|
||||
comment = 'cloud storage for PostgreSQL'
|
||||
default_version = '1.0'
|
||||
module_pathname = '$libdir/neon'
|
||||
19
pgxn/neon/neon.h
Normal file
19
pgxn/neon/neon.h
Normal file
@@ -0,0 +1,19 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neon.h
|
||||
* Functions used in the initialization of this extension.
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* pgxn/neon/neon.h
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#ifndef NEON_H
|
||||
#define NEON_H
|
||||
|
||||
extern void pg_init_libpagestore(void);
|
||||
extern void pg_init_libpqwalproposer(void);
|
||||
extern void pg_init_walproposer(void);
|
||||
|
||||
#endif /* NEON_H */
|
||||
221
pgxn/neon/pagestore_client.h
Normal file
221
pgxn/neon/pagestore_client.h
Normal file
@@ -0,0 +1,221 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* pagestore_client.h
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* pgxn/neon/pagestore_client.h
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#ifndef pageserver_h
|
||||
#define pageserver_h
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
#include "storage/relfilenode.h"
|
||||
#include "storage/block.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "lib/stringinfo.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "utils/memutils.h"
|
||||
|
||||
#include "pg_config.h"
|
||||
|
||||
/*
 * Tags identifying each message type.  Requests are numbered from 0 and
 * responses from 100.
 */
typedef enum
{
    /* pagestore_client -> pagestore */
    T_ZenithExistsRequest = 0,
    T_ZenithNblocksRequest = 1,
    T_ZenithGetPageRequest = 2,
    T_ZenithDbSizeRequest = 3,

    /* pagestore -> pagestore_client */
    T_ZenithExistsResponse = 100,
    T_ZenithNblocksResponse = 101,
    T_ZenithGetPageResponse = 102,
    T_ZenithErrorResponse = 103,
    T_ZenithDbSizeResponse = 104,
} ZenithMessageTag;
|
||||
|
||||
|
||||
|
||||
/* base struct for c-style inheritance */
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
} ZenithMessage;
|
||||
|
||||
#define messageTag(m) (((const ZenithMessage *)(m))->tag)
|
||||
|
||||
/*
|
||||
* supertype of all the Zenith*Request structs below
|
||||
*
|
||||
* If 'latest' is true, we are requesting the latest page version, and 'lsn'
|
||||
* is just a hint to the server that we know there are no versions of the page
|
||||
* (or relation size, for exists/nblocks requests) later than the 'lsn'.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
bool latest; /* if true, request latest page version */
|
||||
XLogRecPtr lsn; /* request page version @ this LSN */
|
||||
} ZenithRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithRequest req;
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
} ZenithExistsRequest;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithRequest req;
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
} ZenithNblocksRequest;
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithRequest req;
|
||||
Oid dbNode;
|
||||
} ZenithDbSizeRequest;
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithRequest req;
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blkno;
|
||||
} ZenithGetPageRequest;
|
||||
|
||||
/* supertype of all the Zenith*Response structs below */
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
} ZenithResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
bool exists;
|
||||
} ZenithExistsResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
uint32 n_blocks;
|
||||
} ZenithNblocksResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
char page[FLEXIBLE_ARRAY_MEMBER];
|
||||
} ZenithGetPageResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
int64 db_size;
|
||||
} ZenithDbSizeResponse;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithMessageTag tag;
|
||||
char message[FLEXIBLE_ARRAY_MEMBER]; /* null-terminated error message */
|
||||
} ZenithErrorResponse;
|
||||
|
||||
extern StringInfoData zm_pack_request(ZenithRequest *msg);
|
||||
extern ZenithResponse *zm_unpack_response(StringInfo s);
|
||||
extern char *zm_to_string(ZenithMessage *msg);
|
||||
|
||||
/*
|
||||
* API
|
||||
*/
|
||||
|
||||
typedef struct
|
||||
{
|
||||
ZenithResponse *(*request) (ZenithRequest *request);
|
||||
} page_server_api;
|
||||
|
||||
extern page_server_api *page_server;
|
||||
|
||||
extern char *page_server_connstring;
|
||||
extern char *zenith_timeline;
|
||||
extern char *zenith_tenant;
|
||||
extern bool wal_redo;
|
||||
extern int32 max_cluster_size;
|
||||
|
||||
extern const f_smgr *smgr_zenith(BackendId backend, RelFileNode rnode);
|
||||
extern void smgr_init_zenith(void);
|
||||
|
||||
extern const f_smgr *smgr_inmem(BackendId backend, RelFileNode rnode);
|
||||
extern void smgr_init_inmem(void);
|
||||
extern void smgr_shutdown_inmem(void);
|
||||
|
||||
/* zenith storage manager functionality */
|
||||
|
||||
extern void zenith_init(void);
|
||||
extern void zenith_open(SMgrRelation reln);
|
||||
extern void zenith_close(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void zenith_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
|
||||
extern bool zenith_exists(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void zenith_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
|
||||
extern void zenith_extend(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
extern bool zenith_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum);
|
||||
extern void zenith_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
|
||||
extern void zenith_read_at_lsn(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer);
|
||||
|
||||
extern void zenith_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
extern void zenith_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks);
|
||||
extern BlockNumber zenith_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
extern const int64 zenith_dbsize(Oid dbNode);
|
||||
extern void zenith_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
extern void zenith_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
/* zenith wal-redo storage manager functionality */
|
||||
|
||||
extern void inmem_init(void);
|
||||
extern void inmem_open(SMgrRelation reln);
|
||||
extern void inmem_close(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void inmem_create(SMgrRelation reln, ForkNumber forknum, bool isRedo);
|
||||
extern bool inmem_exists(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void inmem_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
|
||||
extern void inmem_extend(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
extern bool inmem_prefetch(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum);
|
||||
extern void inmem_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
||||
char *buffer);
|
||||
extern void inmem_write(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, char *buffer, bool skipFsync);
|
||||
extern void inmem_writeback(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber blocknum, BlockNumber nblocks);
|
||||
extern BlockNumber inmem_nblocks(SMgrRelation reln, ForkNumber forknum);
|
||||
extern void inmem_truncate(SMgrRelation reln, ForkNumber forknum,
|
||||
BlockNumber nblocks);
|
||||
extern void inmem_immedsync(SMgrRelation reln, ForkNumber forknum);
|
||||
|
||||
|
||||
/* utils for zenith relsize cache */
|
||||
extern void relsize_hash_init(void);
|
||||
extern bool get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber* size);
|
||||
extern void set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
|
||||
extern void update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size);
|
||||
extern void forget_cached_relsize(RelFileNode rnode, ForkNumber forknum);
|
||||
|
||||
#endif
|
||||
1696
pgxn/neon/pagestore_smgr.c
Normal file
1696
pgxn/neon/pagestore_smgr.c
Normal file
File diff suppressed because it is too large
Load Diff
167
pgxn/neon/relsize_cache.c
Normal file
167
pgxn/neon/relsize_cache.c
Normal file
@@ -0,0 +1,167 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* relsize_cache.c
|
||||
* Relation size cache for better zenith performance.
|
||||
*
|
||||
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* pgxn/neon/relsize_cache.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "pagestore_client.h"
|
||||
#include "storage/relfilenode.h"
|
||||
#include "storage/smgr.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/ipc.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "catalog/pg_tablespace_d.h"
|
||||
#include "utils/dynahash.h"
|
||||
#include "utils/guc.h"
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
} RelTag;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
RelTag tag;
|
||||
BlockNumber size;
|
||||
} RelSizeEntry;
|
||||
|
||||
static HTAB *relsize_hash;
|
||||
static LWLockId relsize_lock;
|
||||
static int relsize_hash_size;
|
||||
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
|
||||
|
||||
/*
|
||||
* Size of a cache entry is 20 bytes. So this default will take about 1.2 MB,
|
||||
* which seems reasonable.
|
||||
*/
|
||||
#define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
|
||||
|
||||
static void
|
||||
zenith_smgr_shmem_startup(void)
|
||||
{
|
||||
static HASHCTL info;
|
||||
|
||||
if (prev_shmem_startup_hook)
|
||||
prev_shmem_startup_hook();
|
||||
|
||||
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
|
||||
relsize_lock = (LWLockId) GetNamedLWLockTranche("neon_relsize");
|
||||
info.keysize = sizeof(RelTag);
|
||||
info.entrysize = sizeof(RelSizeEntry);
|
||||
relsize_hash = ShmemInitHash("neon_relsize",
|
||||
relsize_hash_size, relsize_hash_size,
|
||||
&info,
|
||||
HASH_ELEM | HASH_BLOBS);
|
||||
LWLockRelease(AddinShmemInitLock);
|
||||
}
|
||||
|
||||
bool
|
||||
get_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber *size)
|
||||
{
|
||||
bool found = false;
|
||||
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
RelSizeEntry *entry;
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_SHARED);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_FIND, NULL);
|
||||
if (entry != NULL)
|
||||
{
|
||||
*size = entry->size;
|
||||
found = true;
|
||||
}
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
void
|
||||
set_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
|
||||
{
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
RelSizeEntry *entry;
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_ENTER, NULL);
|
||||
entry->size = size;
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
update_cached_relsize(RelFileNode rnode, ForkNumber forknum, BlockNumber size)
|
||||
{
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
RelSizeEntry *entry;
|
||||
bool found;
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
entry = hash_search(relsize_hash, &tag, HASH_ENTER, &found);
|
||||
if (!found || entry->size < size)
|
||||
entry->size = size;
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
forget_cached_relsize(RelFileNode rnode, ForkNumber forknum)
|
||||
{
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RelTag tag;
|
||||
|
||||
tag.rnode = rnode;
|
||||
tag.forknum = forknum;
|
||||
LWLockAcquire(relsize_lock, LW_EXCLUSIVE);
|
||||
hash_search(relsize_hash, &tag, HASH_REMOVE, NULL);
|
||||
LWLockRelease(relsize_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
relsize_hash_init(void)
|
||||
{
|
||||
DefineCustomIntVariable("neon.relsize_hash_size",
|
||||
"Sets the maximum number of cached relation sizes for neon",
|
||||
NULL,
|
||||
&relsize_hash_size,
|
||||
DEFAULT_RELSIZE_HASH_SIZE,
|
||||
0,
|
||||
INT_MAX,
|
||||
PGC_POSTMASTER,
|
||||
0,
|
||||
NULL, NULL, NULL);
|
||||
|
||||
if (relsize_hash_size > 0)
|
||||
{
|
||||
RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
|
||||
RequestNamedLWLockTranche("neon_relsize", 1);
|
||||
|
||||
prev_shmem_startup_hook = shmem_startup_hook;
|
||||
shmem_startup_hook = zenith_smgr_shmem_startup;
|
||||
}
|
||||
}
|
||||
2403
pgxn/neon/walproposer.c
Normal file
2403
pgxn/neon/walproposer.c
Normal file
File diff suppressed because it is too large
Load Diff
540
pgxn/neon/walproposer.h
Normal file
540
pgxn/neon/walproposer.h
Normal file
@@ -0,0 +1,540 @@
|
||||
#ifndef __NEON_WALPROPOSER_H__
|
||||
#define __NEON_WALPROPOSER_H__
|
||||
|
||||
#include "access/xlogdefs.h"
|
||||
#include "postgres.h"
|
||||
#include "port.h"
|
||||
#include "access/xlog_internal.h"
|
||||
#include "access/transam.h"
|
||||
#include "nodes/replnodes.h"
|
||||
#include "utils/uuid.h"
|
||||
#include "replication/walreceiver.h"
|
||||
|
||||
#define SK_MAGIC 0xCafeCeefu
|
||||
#define SK_PROTOCOL_VERSION 2
|
||||
|
||||
#define MAX_SAFEKEEPERS 32
|
||||
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16) /* max size of a single WAL message */
|
||||
#define XLOG_HDR_SIZE (1+8*3) /* 'w' + startPos + walEnd + timestamp */
|
||||
#define XLOG_HDR_START_POS 1 /* offset of start position in wal sender message header */
|
||||
#define XLOG_HDR_END_POS (1+8) /* offset of end position in wal sender message header */
|
||||
|
||||
/*
|
||||
* In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
|
||||
* because all WL_* events are given flags equal to some (1 << i), starting from i = 0
|
||||
*/
|
||||
#define WL_NO_EVENTS 0
|
||||
|
||||
extern char* wal_acceptors_list;
|
||||
extern int wal_acceptor_reconnect_timeout;
|
||||
extern int wal_acceptor_connect_timeout;
|
||||
extern bool am_wal_proposer;
|
||||
|
||||
struct WalProposerConn; /* Defined in libpqwalproposer */
|
||||
typedef struct WalProposerConn WalProposerConn;
|
||||
|
||||
struct WalMessage;
|
||||
typedef struct WalMessage WalMessage;
|
||||
|
||||
extern char *zenith_timeline_walproposer;
|
||||
extern char *zenith_tenant_walproposer;
|
||||
|
||||
/* Possible return values from ReadPGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The full read was successful. buf now points to the data */
|
||||
PG_ASYNC_READ_SUCCESS,
|
||||
/* The read is ongoing. Wait until the connection is read-ready, then try
|
||||
* again. */
|
||||
PG_ASYNC_READ_TRY_AGAIN,
|
||||
/* Reading failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_READ_FAIL,
|
||||
} PGAsyncReadResult;
|
||||
|
||||
/* Possible return values from WritePGAsync */
|
||||
typedef enum
|
||||
{
|
||||
/* The write fully completed */
|
||||
PG_ASYNC_WRITE_SUCCESS,
|
||||
/* The write started, but you'll need to call PQflush some more times
|
||||
* to finish it off. We just tried, so it's best to wait until the
|
||||
* connection is read- or write-ready to try again.
|
||||
*
|
||||
* If it becomes read-ready, call PQconsumeInput and flush again. If it
|
||||
* becomes write-ready, just call PQflush.
|
||||
*/
|
||||
PG_ASYNC_WRITE_TRY_FLUSH,
|
||||
/* Writing failed. Check PQerrorMessage(conn) */
|
||||
PG_ASYNC_WRITE_FAIL,
|
||||
} PGAsyncWriteResult;
|
||||
|
||||
/*
|
||||
* WAL safekeeper state, which is used to wait for some event.
|
||||
*
|
||||
* States are listed here in the order that they're executed.
|
||||
*
|
||||
* Most states, upon failure, will move back to SS_OFFLINE by calls to
|
||||
* ResetConnection or ShutdownConnection.
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
/*
|
||||
* Does not have an active connection and will stay that way until
|
||||
* further notice.
|
||||
*
|
||||
* Moves to SS_CONNECTING_WRITE by calls to ResetConnection.
|
||||
*/
|
||||
SS_OFFLINE,
|
||||
|
||||
/*
|
||||
* Connecting states. "_READ" waits for the socket to be available for
|
||||
* reading, "_WRITE" waits for writing. There's no difference in the code
|
||||
* they execute when polled, but we have this distinction in order to
|
||||
* recreate the event set in HackyRemoveWalProposerEvent.
|
||||
*
|
||||
* After the connection is made, "START_WAL_PUSH" query is sent.
|
||||
*/
|
||||
SS_CONNECTING_WRITE,
|
||||
SS_CONNECTING_READ,
|
||||
|
||||
/*
|
||||
* Waiting for the result of the "START_WAL_PUSH" command.
|
||||
*
|
||||
* After we get a successful result, sends handshake to safekeeper.
|
||||
*/
|
||||
SS_WAIT_EXEC_RESULT,
|
||||
|
||||
/*
|
||||
* Executing the receiving half of the handshake. After receiving, moves to
|
||||
* SS_VOTING.
|
||||
*/
|
||||
SS_HANDSHAKE_RECV,
|
||||
|
||||
/*
|
||||
* Waiting to participate in voting, but a quorum hasn't yet been reached.
|
||||
* This is an idle state - we do not expect AdvancePollState to be called.
|
||||
*
|
||||
* Moved externally by execution of SS_HANDSHAKE_RECV, when we received a
|
||||
* quorum of handshakes.
|
||||
*/
|
||||
SS_VOTING,
|
||||
|
||||
/*
|
||||
* Already sent voting information, waiting to receive confirmation from the
|
||||
* node. After receiving, moves to SS_IDLE, if the quorum isn't reached yet.
|
||||
*/
|
||||
SS_WAIT_VERDICT,
|
||||
|
||||
/* Need to flush ProposerElected message. */
|
||||
SS_SEND_ELECTED_FLUSH,
|
||||
|
||||
/*
|
||||
* Waiting for quorum to send WAL. Idle state. If the socket becomes
|
||||
* read-ready, the connection has been closed.
|
||||
*
|
||||
* Moves to SS_ACTIVE only by call to StartStreaming.
|
||||
*/
|
||||
SS_IDLE,
|
||||
|
||||
/*
|
||||
* Active phase, when we acquired quorum and have WAL to send or feedback
|
||||
* to read.
|
||||
*/
|
||||
SS_ACTIVE,
|
||||
} SafekeeperState;
|
||||
|
||||
/* Consensus logical timestamp. */
|
||||
typedef uint64 term_t;
|
||||
|
||||
/* neon storage node id */
|
||||
typedef uint64 NNodeId;
|
||||
|
||||
/*
|
||||
* Proposer <-> Acceptor messaging.
|
||||
*/
|
||||
|
||||
/* Initial Proposer -> Acceptor message */
|
||||
typedef struct ProposerGreeting
|
||||
{
|
||||
uint64 tag; /* message tag */
|
||||
uint32 protocolVersion; /* proposer-safekeeper protocol version */
|
||||
uint32 pgVersion;
|
||||
pg_uuid_t proposerId;
|
||||
uint64 systemId; /* Postgres system identifier */
|
||||
uint8 ztimelineid[16]; /* Zenith timeline id */
|
||||
uint8 ztenantid[16];
|
||||
TimeLineID timeline;
|
||||
uint32 walSegSize;
|
||||
} ProposerGreeting;
|
||||
|
||||
typedef struct AcceptorProposerMessage
|
||||
{
|
||||
uint64 tag;
|
||||
} AcceptorProposerMessage;
|
||||
|
||||
/*
|
||||
* Acceptor -> Proposer initial response: the highest term acceptor voted for.
|
||||
*/
|
||||
typedef struct AcceptorGreeting
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
term_t term;
|
||||
NNodeId nodeId;
|
||||
} AcceptorGreeting;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor vote request.
|
||||
*/
|
||||
typedef struct VoteRequest
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} VoteRequest;
|
||||
|
||||
/* Element of term switching chain. */
|
||||
typedef struct TermSwitchEntry
|
||||
{
|
||||
term_t term;
|
||||
XLogRecPtr lsn;
|
||||
} TermSwitchEntry;
|
||||
|
||||
typedef struct TermHistory
|
||||
{
|
||||
uint32 n_entries;
|
||||
TermSwitchEntry *entries;
|
||||
} TermHistory;
|
||||
|
||||
/* Vote itself, sent from safekeeper to proposer */
|
||||
typedef struct VoteResponse {
|
||||
AcceptorProposerMessage apm;
|
||||
term_t term;
|
||||
uint64 voteGiven;
|
||||
/*
|
||||
* Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
* proposer to choose the most advanced one.
|
||||
*/
|
||||
XLogRecPtr flushLsn;
|
||||
XLogRecPtr truncateLsn; /* minimal LSN which may be needed for recovery of some safekeeper */
|
||||
TermHistory termHistory;
|
||||
XLogRecPtr timelineStartLsn; /* timeline globally starts at this LSN */
|
||||
} VoteResponse;
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor message announcing proposer is elected and communicating
|
||||
* epoch history to it.
|
||||
*/
|
||||
typedef struct ProposerElected
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term;
|
||||
/* proposer will send since this point */
|
||||
XLogRecPtr startStreamingAt;
|
||||
/* history of term switches up to this proposer */
|
||||
TermHistory *termHistory;
|
||||
/* timeline globally starts at this LSN */
|
||||
XLogRecPtr timelineStartLsn;
|
||||
} ProposerElected;
|
||||
|
||||
/*
|
||||
* Header of request with WAL message sent from proposer to safekeeper.
|
||||
*/
|
||||
typedef struct AppendRequestHeader
|
||||
{
|
||||
uint64 tag;
|
||||
term_t term; /* term of the proposer */
|
||||
/*
|
||||
* LSN since which current proposer appends WAL (begin_lsn of its first
|
||||
* record); determines epoch switch point.
|
||||
*/
|
||||
XLogRecPtr epochStartLsn;
|
||||
XLogRecPtr beginLsn; /* start position of message in WAL */
|
||||
XLogRecPtr endLsn; /* end position of message in WAL */
|
||||
XLogRecPtr commitLsn; /* LSN committed by quorum of safekeepers */
|
||||
/*
|
||||
* minimal LSN which may be needed for recovery of some safekeeper (end lsn
|
||||
* + 1 of last chunk streamed to everyone)
|
||||
*/
|
||||
XLogRecPtr truncateLsn;
|
||||
pg_uuid_t proposerId; /* for monitoring/debugging */
|
||||
} AppendRequestHeader;
|
||||
|
||||
/*
|
||||
* Hot standby feedback received from replica
|
||||
*/
|
||||
typedef struct HotStandbyFeedback
|
||||
{
|
||||
TimestampTz ts;
|
||||
FullTransactionId xmin;
|
||||
FullTransactionId catalog_xmin;
|
||||
} HotStandbyFeedback;
|
||||
|
||||
|
||||
typedef struct ReplicationFeedback
|
||||
{
|
||||
// current size of the timeline on pageserver
|
||||
uint64 currentClusterSize;
|
||||
// standby_status_update fields that safekeeper received from pageserver
|
||||
XLogRecPtr ps_writelsn;
|
||||
XLogRecPtr ps_flushlsn;
|
||||
XLogRecPtr ps_applylsn;
|
||||
TimestampTz ps_replytime;
|
||||
} ReplicationFeedback;
|
||||
|
||||
|
||||
typedef struct WalproposerShmemState
|
||||
{
|
||||
slock_t mutex;
|
||||
ReplicationFeedback feedback;
|
||||
term_t mineLastElectedTerm;
|
||||
} WalproposerShmemState;
|
||||
|
||||
/*
|
||||
* Report safekeeper state to proposer
|
||||
*/
|
||||
typedef struct AppendResponse
|
||||
{
|
||||
AcceptorProposerMessage apm;
|
||||
/*
|
||||
* Current term of the safekeeper; if it is higher than proposer's, the
|
||||
* compute is out of date.
|
||||
*/
|
||||
term_t term;
|
||||
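// Safekeeper's flush position -- NOTE(review): presumably the end of WAL flushed by this safekeeper, as for VoteResponse.flushLsn; confirm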
|
||||
XLogRecPtr flushLsn;
|
||||
// Safekeeper reports back his awareness about which WAL is committed, as
|
||||
// this is a criterion for walproposer --sync mode exit
|
||||
XLogRecPtr commitLsn;
|
||||
HotStandbyFeedback hs;
|
||||
// Feedback received from pageserver includes standby_status_update fields
|
||||
// and custom zenith feedback.
|
||||
// This part of the message is extensible.
|
||||
ReplicationFeedback rf;
|
||||
} AppendResponse;
|
||||
|
||||
// ReplicationFeedback is extensible part of the message that is parsed separately
|
||||
// Other fields are fixed part
|
||||
#define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)
|
||||
|
||||
|
||||
/*
|
||||
* Descriptor of safekeeper
|
||||
*/
|
||||
typedef struct Safekeeper
|
||||
{
|
||||
char const* host;
|
||||
char const* port;
|
||||
char conninfo[MAXCONNINFO]; /* connection info for connecting/reconnecting */
|
||||
|
||||
/*
|
||||
* postgres protocol connection to the WAL acceptor
|
||||
*
|
||||
* Equals NULL only when state = SS_OFFLINE. Nonblocking is set once we
|
||||
* reach SS_ACTIVE; not before.
|
||||
*/
|
||||
WalProposerConn* conn;
|
||||
/*
|
||||
* Temporary buffer for the message being sent to the safekeeper.
|
||||
*/
|
||||
StringInfoData outbuf;
|
||||
/*
|
||||
* WAL reader, allocated for each safekeeper.
|
||||
*/
|
||||
XLogReaderState* xlogreader;
|
||||
|
||||
/*
|
||||
* Streaming will start here; must be record boundary.
|
||||
*/
|
||||
XLogRecPtr startStreamingAt;
|
||||
|
||||
bool flushWrite; /* set to true if we need to call AsyncFlush, to flush pending messages */
|
||||
XLogRecPtr streamingAt; /* current streaming position */
|
||||
AppendRequestHeader appendRequest; /* request for sending to safekeeper */
|
||||
|
||||
int eventPos; /* position in wait event set. Equal to -1 if no event */
|
||||
SafekeeperState state; /* safekeeper state machine state */
|
||||
TimestampTz startedConnAt; /* when connection attempt started */
|
||||
AcceptorGreeting greetResponse; /* acceptor greeting */
|
||||
VoteResponse voteResponse; /* the vote */
|
||||
AppendResponse appendResponse; /* feedback for master */
|
||||
} Safekeeper;
|
||||
|
||||
|
||||
extern PGDLLIMPORT void WalProposerMain(Datum main_arg);
|
||||
void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
|
||||
void WalProposerPoll(void);
|
||||
void WalProposerRegister(void);
|
||||
void ParseReplicationFeedbackMessage(StringInfo reply_message,
|
||||
ReplicationFeedback *rf);
|
||||
extern void StartProposerReplication(StartReplicationCmd *cmd);
|
||||
|
||||
Size WalproposerShmemSize(void);
|
||||
bool WalproposerShmemInit(void);
|
||||
void replication_feedback_set(ReplicationFeedback *rf);
|
||||
void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
|
||||
|
||||
/* libpqwalproposer hooks & helper type */
|
||||
|
||||
/* Re-exported PostgresPollingStatusType */
|
||||
typedef enum
|
||||
{
|
||||
WP_CONN_POLLING_FAILED = 0,
|
||||
WP_CONN_POLLING_READING,
|
||||
WP_CONN_POLLING_WRITING,
|
||||
WP_CONN_POLLING_OK,
|
||||
/*
|
||||
* 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
|
||||
* We've removed it here to avoid clutter.
|
||||
*/
|
||||
} WalProposerConnectPollStatusType;
|
||||
|
||||
/* Re-exported and modified ExecStatusType */
|
||||
typedef enum
|
||||
{
|
||||
/* We received a single CopyBoth result */
|
||||
WP_EXEC_SUCCESS_COPYBOTH,
|
||||
/* Any success result other than a single CopyBoth was received. The specifics of the result
|
||||
* were already logged, but it may be useful to provide an error message indicating which
|
||||
* safekeeper messed up.
|
||||
*
|
||||
* Do not expect PQerrorMessage to be appropriately set. */
|
||||
WP_EXEC_UNEXPECTED_SUCCESS,
|
||||
/* No result available at this time. Wait until read-ready, then call again. Internally, this is
|
||||
* returned when PQisBusy indicates that PQgetResult would block. */
|
||||
WP_EXEC_NEEDS_INPUT,
|
||||
/* Catch-all failure. Check PQerrorMessage. */
|
||||
WP_EXEC_FAILED,
|
||||
} WalProposerExecStatusType;
|
||||
|
||||
/* Re-exported ConnStatusType */
|
||||
typedef enum
|
||||
{
|
||||
WP_CONNECTION_OK,
|
||||
WP_CONNECTION_BAD,
|
||||
|
||||
/*
|
||||
* The original ConnStatusType has many more tags, but requests that
|
||||
* they not be relied upon (except for displaying to the user). We
|
||||
* don't need that extra functionality, so we collect them into a
|
||||
* single tag here.
|
||||
*/
|
||||
WP_CONNECTION_IN_PROGRESS,
|
||||
} WalProposerConnStatusType;
|
||||
|
||||
/* Re-exported PQerrorMessage */
|
||||
typedef char* (*walprop_error_message_fn) (WalProposerConn* conn);
|
||||
|
||||
/* Re-exported PQstatus */
|
||||
typedef WalProposerConnStatusType (*walprop_status_fn) (WalProposerConn* conn);
|
||||
|
||||
/* Re-exported PQconnectStart */
|
||||
typedef WalProposerConn* (*walprop_connect_start_fn) (char* conninfo);
|
||||
|
||||
/* Re-exported PQconnectPoll */
|
||||
typedef WalProposerConnectPollStatusType (*walprop_connect_poll_fn) (WalProposerConn* conn);
|
||||
|
||||
/* Blocking wrapper around PQsendQuery */
|
||||
typedef bool (*walprop_send_query_fn) (WalProposerConn* conn, char* query);
|
||||
|
||||
/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
|
||||
typedef WalProposerExecStatusType (*walprop_get_query_result_fn) (WalProposerConn* conn);
|
||||
|
||||
/* Re-exported PQsocket */
|
||||
typedef pgsocket (*walprop_socket_fn) (WalProposerConn* conn);
|
||||
|
||||
/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
|
||||
typedef int (*walprop_flush_fn) (WalProposerConn* conn);
|
||||
|
||||
/* Re-exported PQfinish */
|
||||
typedef void (*walprop_finish_fn) (WalProposerConn* conn);
|
||||
|
||||
/*
|
||||
* Ergonomic wrapper around PQgetCopyData
|
||||
*
|
||||
* Reads a CopyData block from a safekeeper, setting *amount to the number
|
||||
* of bytes returned.
|
||||
*
|
||||
* This function is allowed to assume certain properties specific to the
|
||||
* protocol with the safekeepers, so it should not be used as-is for any
|
||||
* other purpose.
|
||||
*
|
||||
* Note: If possible, using <AsyncRead> is generally preferred, because it
|
||||
* performs a bit of extra checking work that's always required and is normally
|
||||
* somewhat verbose.
|
||||
*/
|
||||
typedef PGAsyncReadResult (*walprop_async_read_fn) (WalProposerConn* conn,
|
||||
char** buf,
|
||||
int* amount);
|
||||
|
||||
/*
|
||||
* Ergonomic wrapper around PQputCopyData + PQflush
|
||||
*
|
||||
* Starts to write a CopyData block to a safekeeper.
|
||||
*
|
||||
* For information on the meaning of return codes, refer to PGAsyncWriteResult.
|
||||
*/
|
||||
typedef PGAsyncWriteResult (*walprop_async_write_fn) (WalProposerConn* conn,
|
||||
void const* buf,
|
||||
size_t size);
|
||||
|
||||
/*
|
||||
* Blocking equivalent to walprop_async_write_fn
|
||||
*
|
||||
* Returns 'true' if successful, 'false' on failure.
|
||||
*/
|
||||
typedef bool (*walprop_blocking_write_fn) (WalProposerConn* conn, void const* buf, size_t size);
|
||||
|
||||
/* All libpqwalproposer exported functions collected together. */
|
||||
typedef struct WalProposerFunctionsType
|
||||
{
|
||||
walprop_error_message_fn walprop_error_message;
|
||||
walprop_status_fn walprop_status;
|
||||
walprop_connect_start_fn walprop_connect_start;
|
||||
walprop_connect_poll_fn walprop_connect_poll;
|
||||
walprop_send_query_fn walprop_send_query;
|
||||
walprop_get_query_result_fn walprop_get_query_result;
|
||||
walprop_socket_fn walprop_socket;
|
||||
walprop_flush_fn walprop_flush;
|
||||
walprop_finish_fn walprop_finish;
|
||||
walprop_async_read_fn walprop_async_read;
|
||||
walprop_async_write_fn walprop_async_write;
|
||||
walprop_blocking_write_fn walprop_blocking_write;
|
||||
} WalProposerFunctionsType;
|
||||
|
||||
/*
 * Allow the above functions to be "called" with normal syntax.
 *
 * Each macro forwards to the member of the same name in
 * WalProposerFunctionsType through the WalProposerFunctions pointer, so
 * there must be exactly one macro per member of that struct. A macro for a
 * member that does not exist would only fail to compile at its first use.
 */
#define walprop_error_message(conn) \
	WalProposerFunctions->walprop_error_message(conn)
#define walprop_status(conn) \
	WalProposerFunctions->walprop_status(conn)
#define walprop_connect_start(conninfo) \
	WalProposerFunctions->walprop_connect_start(conninfo)
#define walprop_connect_poll(conn) \
	WalProposerFunctions->walprop_connect_poll(conn)
#define walprop_send_query(conn, query) \
	WalProposerFunctions->walprop_send_query(conn, query)
#define walprop_get_query_result(conn) \
	WalProposerFunctions->walprop_get_query_result(conn)
#define walprop_socket(conn) \
	WalProposerFunctions->walprop_socket(conn)
#define walprop_flush(conn) \
	WalProposerFunctions->walprop_flush(conn)
#define walprop_finish(conn) \
	WalProposerFunctions->walprop_finish(conn)
#define walprop_async_read(conn, buf, amount) \
	WalProposerFunctions->walprop_async_read(conn, buf, amount)
#define walprop_async_write(conn, buf, size) \
	WalProposerFunctions->walprop_async_write(conn, buf, size)
#define walprop_blocking_write(conn, buf, size) \
	WalProposerFunctions->walprop_blocking_write(conn, buf, size)
|
||||
|
||||
/*
|
||||
* The runtime location of the libpqwalproposer functions.
|
||||
*
|
||||
* This pointer is set by the initializer in libpqwalproposer, so that we
|
||||
* can use it later.
|
||||
*/
|
||||
extern PGDLLIMPORT WalProposerFunctionsType *WalProposerFunctions;
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_H__ */
|
||||
1110
pgxn/neon/walproposer_utils.c
Normal file
1110
pgxn/neon/walproposer_utils.c
Normal file
File diff suppressed because it is too large
Load Diff
19
pgxn/neon/walproposer_utils.h
Normal file
19
pgxn/neon/walproposer_utils.h
Normal file
@@ -0,0 +1,19 @@
|
||||
#ifndef __NEON_WALPROPOSER_UTILS_H__
|
||||
#define __NEON_WALPROPOSER_UTILS_H__
|
||||
|
||||
#include "walproposer.h"
|
||||
|
||||
int CompareLsn(const void *a, const void *b);
|
||||
char* FormatSafekeeperState(SafekeeperState state);
|
||||
void AssertEventsOkForState(uint32 events, Safekeeper* sk);
|
||||
uint32 SafekeeperStateDesiredEvents(SafekeeperState state);
|
||||
char* FormatEvents(uint32 events);
|
||||
bool HexDecodeString(uint8 *result, char *input, int nbytes);
|
||||
uint32 pq_getmsgint32_le(StringInfo msg);
|
||||
uint64 pq_getmsgint64_le(StringInfo msg);
|
||||
void pq_sendint32_le(StringInfo buf, uint32 i);
|
||||
void pq_sendint64_le(StringInfo buf, uint64 i);
|
||||
void XLogWalPropWrite(char *buf, Size nbytes, XLogRecPtr recptr);
|
||||
void XLogWalPropClose(XLogRecPtr recptr);
|
||||
|
||||
#endif /* __NEON_WALPROPOSER_UTILS_H__ */
|
||||
15
pgxn/neon_test_utils/Makefile
Normal file
15
pgxn/neon_test_utils/Makefile
Normal file
@@ -0,0 +1,15 @@
|
||||
# pgxn/neon_test_utils/Makefile
|
||||
|
||||
|
||||
MODULE_big = neon_test_utils
|
||||
OBJS = \
|
||||
$(WIN32RES) \
|
||||
neontest.o
|
||||
|
||||
EXTENSION = neon_test_utils
|
||||
DATA = neon_test_utils--1.0.sql
|
||||
PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging"
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
include $(PGXS)
|
||||
29
pgxn/neon_test_utils/neon_test_utils--1.0.sql
Normal file
29
pgxn/neon_test_utils/neon_test_utils--1.0.sql
Normal file
@@ -0,0 +1,29 @@
|
||||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION neon_test_utils" to load this file. \quit
|
||||
|
||||
CREATE FUNCTION test_consume_xids(nxids int)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'test_consume_xids'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION clear_buffer_cache()
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'clear_buffer_cache'
|
||||
LANGUAGE C STRICT
|
||||
PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION get_raw_page_at_lsn(relname text, forkname text, blocknum int8, lsn pg_lsn)
|
||||
RETURNS bytea
|
||||
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION get_raw_page_at_lsn(tbspc oid, db oid, relfilenode oid, forknum int8, blocknum int8, lsn pg_lsn)
|
||||
RETURNS bytea
|
||||
AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
|
||||
CREATE FUNCTION neon_xlogflush(lsn pg_lsn)
|
||||
RETURNS VOID
|
||||
AS 'MODULE_PATHNAME', 'neon_xlogflush'
|
||||
LANGUAGE C PARALLEL UNSAFE;
|
||||
5
pgxn/neon_test_utils/neon_test_utils.control
Normal file
5
pgxn/neon_test_utils/neon_test_utils.control
Normal file
@@ -0,0 +1,5 @@
|
||||
# neon_test_utils extension
|
||||
comment = 'helpers for neon testing and debugging'
|
||||
default_version = '1.0'
|
||||
module_pathname = '$libdir/neon_test_utils'
|
||||
relocatable = true
|
||||
304
pgxn/neon_test_utils/neontest.c
Normal file
304
pgxn/neon_test_utils/neontest.c
Normal file
@@ -0,0 +1,304 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* neontest.c
|
||||
* Helpers for neon testing and debugging
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* pgxn/neon_test_utils/neontest.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/relation.h"
|
||||
#include "access/xact.h"
|
||||
#include "access/xlog.h"
|
||||
#include "catalog/namespace.h"
|
||||
#include "fmgr.h"
|
||||
#include "funcapi.h"
|
||||
#include "miscadmin.h"
|
||||
#include "storage/buf_internals.h"
|
||||
#include "storage/bufmgr.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/pg_lsn.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/varlena.h"
|
||||
#include "../neon/pagestore_client.h"
|
||||
|
||||
PG_MODULE_MAGIC;
|
||||
|
||||
extern void _PG_init(void);
|
||||
|
||||
PG_FUNCTION_INFO_V1(test_consume_xids);
|
||||
PG_FUNCTION_INFO_V1(clear_buffer_cache);
|
||||
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
|
||||
PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
|
||||
PG_FUNCTION_INFO_V1(neon_xlogflush);
|
||||
|
||||
/*
|
||||
* Linkage to functions in zenith module.
|
||||
* The signature here would need to be updated whenever function parameters change in pagestore_smgr.c
|
||||
*/
|
||||
typedef void (*zenith_read_at_lsn_type)(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno,
|
||||
XLogRecPtr request_lsn, bool request_latest, char *buffer);
|
||||
|
||||
static zenith_read_at_lsn_type zenith_read_at_lsn_ptr;
|
||||
|
||||
/*
|
||||
* Module initialize function: fetch function pointers for cross-module calls.
|
||||
*/
|
||||
void
|
||||
_PG_init(void)
|
||||
{
|
||||
/* Asserts verify that typedefs above match original declarations */
|
||||
AssertVariableIsOfType(&zenith_read_at_lsn, zenith_read_at_lsn_type);
|
||||
zenith_read_at_lsn_ptr = (zenith_read_at_lsn_type)
|
||||
load_external_function("$libdir/neon", "zenith_read_at_lsn",
|
||||
true, NULL);
|
||||
}
|
||||
|
||||
#define zenith_read_at_lsn zenith_read_at_lsn_ptr
|
||||
|
||||
/*
|
||||
* test_consume_xids(int4), for rapidly consuming XIDs, to test wraparound.
|
||||
*/
|
||||
Datum
|
||||
test_consume_xids(PG_FUNCTION_ARGS)
|
||||
{
|
||||
int32 nxids = PG_GETARG_INT32(0);
|
||||
TransactionId topxid;
|
||||
FullTransactionId fullxid;
|
||||
TransactionId xid;
|
||||
TransactionId targetxid;
|
||||
|
||||
/* make sure we have a top-XID first */
|
||||
topxid = GetTopTransactionId();
|
||||
|
||||
xid = ReadNextTransactionId();
|
||||
|
||||
targetxid = xid + nxids;
|
||||
while (targetxid < FirstNormalTransactionId)
|
||||
targetxid++;
|
||||
|
||||
while (TransactionIdPrecedes(xid, targetxid))
|
||||
{
|
||||
fullxid = GetNewTransactionId(true);
|
||||
xid = XidFromFullTransactionId(fullxid);
|
||||
elog(DEBUG1, "topxid: %u xid: %u", topxid, xid);
|
||||
}
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush the buffer cache, evicting all pages that are not currently pinned.
|
||||
*/
|
||||
Datum
|
||||
clear_buffer_cache(PG_FUNCTION_ARGS)
|
||||
{
|
||||
bool save_zenith_test_evict;
|
||||
|
||||
/*
|
||||
* Temporarily set the zenith_test_evict GUC, so that when we pin and
|
||||
* unpin a buffer, the buffer is evicted. We use that hack to evict all
|
||||
* buffers, as there is no explicit "evict this buffer" function in the
|
||||
* buffer manager.
|
||||
*/
|
||||
save_zenith_test_evict = zenith_test_evict;
|
||||
zenith_test_evict = true;
|
||||
PG_TRY();
|
||||
{
|
||||
/* Scan through all the buffers */
|
||||
for (int i = 0; i < NBuffers; i++)
|
||||
{
|
||||
BufferDesc *bufHdr;
|
||||
uint32 buf_state;
|
||||
Buffer bufferid;
|
||||
bool isvalid;
|
||||
RelFileNode rnode;
|
||||
ForkNumber forknum;
|
||||
BlockNumber blocknum;
|
||||
|
||||
/* Peek into the buffer header to see what page it holds. */
|
||||
bufHdr = GetBufferDescriptor(i);
|
||||
buf_state = LockBufHdr(bufHdr);
|
||||
|
||||
if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
|
||||
isvalid = true;
|
||||
else
|
||||
isvalid = false;
|
||||
bufferid = BufferDescriptorGetBuffer(bufHdr);
|
||||
rnode = bufHdr->tag.rnode;
|
||||
forknum = bufHdr->tag.forkNum;
|
||||
blocknum = bufHdr->tag.blockNum;
|
||||
|
||||
UnlockBufHdr(bufHdr, buf_state);
|
||||
|
||||
/*
|
||||
* Pin the buffer, and release it again. Because we have
|
||||
* zenith_test_evict==true, this will evict the page from
|
||||
* the buffer cache if no one else is holding a pin on it.
|
||||
*/
|
||||
if (isvalid)
|
||||
{
|
||||
if (ReadRecentBuffer(rnode, forknum, blocknum, bufferid))
|
||||
ReleaseBuffer(bufferid);
|
||||
}
|
||||
}
|
||||
}
|
||||
PG_FINALLY();
|
||||
{
|
||||
/* restore the GUC */
|
||||
zenith_test_evict = save_zenith_test_evict;
|
||||
}
|
||||
PG_END_TRY();
|
||||
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Reads the page from page server without buffer cache
|
||||
* usage mimics get_raw_page() in pageinspect, but offers reading versions at specific LSN
|
||||
* NULL read lsn will result in reading the latest version.
|
||||
*
|
||||
* Note: reading latest version will result in waiting for latest changes to reach the page server,
|
||||
* if this is undesirable, use pageinspect's get_raw_page that uses buffered access to the latest page
|
||||
*/
|
||||
Datum
|
||||
get_raw_page_at_lsn(PG_FUNCTION_ARGS)
|
||||
{
|
||||
/*
 * Arguments, as read below:
 *   0: relation name (text, possibly qualified)
 *   1: fork name (text)
 *   2: block number
 *   3: LSN to read at; NULL means "read the latest version"
 * Returns a bytea of BLCKSZ bytes filled in by zenith_read_at_lsn(),
 * or SQL NULL when any of arguments 0..2 is NULL.
 */
bytea *raw_page;
|
||||
ForkNumber forknum;
|
||||
RangeVar *relrv;
|
||||
Relation rel;
|
||||
char *raw_page_data;
|
||||
text *relname;
|
||||
text *forkname;
|
||||
uint32 blkno;
|
||||
|
||||
/* A NULL LSN argument selects the current WAL insert position instead. */
bool request_latest = PG_ARGISNULL(3);
|
||||
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(3);
|
||||
|
||||
/*
 * NULL in any of the first three arguments yields NULL. Note that this
 * check runs before the superuser check below.
 */
if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2))
|
||||
PG_RETURN_NULL();
|
||||
|
||||
relname = PG_GETARG_TEXT_PP(0);
|
||||
forkname = PG_GETARG_TEXT_PP(1);
|
||||
blkno = PG_GETARG_UINT32(2);
|
||||
|
||||
/* Raw page access is restricted to superusers. */
if (!superuser())
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("must be superuser to use raw page functions")));
|
||||
|
||||
/* Resolve the name and open the relation under AccessShareLock. */
relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname));
|
||||
rel = relation_openrv(relrv, AccessShareLock);
|
||||
|
||||
/* Check that this relation has storage */
|
||||
if (rel->rd_rel->relkind == RELKIND_VIEW)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
||||
errmsg("cannot get raw page from view \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
if (rel->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
||||
errmsg("cannot get raw page from composite type \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
if (rel->rd_rel->relkind == RELKIND_FOREIGN_TABLE)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
||||
errmsg("cannot get raw page from foreign table \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
||||
errmsg("cannot get raw page from partitioned table \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
if (rel->rd_rel->relkind == RELKIND_PARTITIONED_INDEX)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_WRONG_OBJECT_TYPE),
|
||||
errmsg("cannot get raw page from partitioned index \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
|
||||
/*
|
||||
* Reject attempts to read non-local temporary relations; we would be
|
||||
* likely to get wrong data since we have no visibility into the owning
|
||||
* session's local buffers.
|
||||
*/
|
||||
if (RELATION_IS_OTHER_TEMP(rel))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||
errmsg("cannot access temporary tables of other sessions")));
|
||||
|
||||
|
||||
/* Translate the textual fork name into a ForkNumber. */
forknum = forkname_to_number(text_to_cstring(forkname));
|
||||
|
||||
/* Initialize buffer to copy to */
|
||||
raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
|
||||
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
|
||||
raw_page_data = VARDATA(raw_page);
|
||||
|
||||
/* Fetch the page image into the bytea payload, using the relation's rd_node. */
zenith_read_at_lsn(rel->rd_node, forknum, blkno, read_lsn, request_latest, raw_page_data);
|
||||
|
||||
relation_close(rel, AccessShareLock);
|
||||
|
||||
PG_RETURN_BYTEA_P(raw_page);
|
||||
}
|
||||
|
||||
/*
|
||||
* Another option to read a relation page from page server without cache
|
||||
* this version doesn't validate input and allows reading blocks of dropped relations
|
||||
*
|
||||
* Note: reading latest version will result in waiting for latest changes to reach the page server,
|
||||
* if this is undesirable, use pageinspect's get_raw_page, which uses buffered access to the latest page
|
||||
*/
|
||||
Datum
|
||||
get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS)
|
||||
{
|
||||
/*
 * Arguments, as read below:
 *   0: tablespace OID (spcNode)
 *   1: database OID (dbNode)
 *   2: relfilenode OID (relNode)
 *   3: fork number
 *   4: block number
 *   5: LSN to read at; NULL means "read the latest version"
 * Returns a bytea of BLCKSZ bytes filled in by zenith_read_at_lsn(),
 * or SQL NULL when any of arguments 0..4 is NULL. No relation is opened
 * here, so the identifiers are passed through without any lookup.
 */
char *raw_page_data;
|
||||
|
||||
/* Raw page access is restricted to superusers; checked before the NULL test. */
if (!superuser())
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
|
||||
errmsg("must be superuser to use raw page functions")));
|
||||
|
||||
if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) ||
|
||||
PG_ARGISNULL(3) || PG_ARGISNULL(4))
|
||||
PG_RETURN_NULL();
|
||||
|
||||
{
|
||||
/* Build the physical relation identifier directly from the caller's OIDs. */
RelFileNode rnode = {
|
||||
.spcNode = PG_GETARG_OID(0),
|
||||
.dbNode = PG_GETARG_OID(1),
|
||||
.relNode = PG_GETARG_OID(2)
|
||||
};
|
||||
|
||||
/* The fork is given numerically here and is not range-checked in this function. */
ForkNumber forknum = PG_GETARG_UINT32(3);
|
||||
|
||||
uint32 blkno = PG_GETARG_UINT32(4);
|
||||
/* A NULL LSN argument selects the current WAL insert position instead. */
bool request_latest = PG_ARGISNULL(5);
|
||||
uint64 read_lsn = request_latest ? GetXLogInsertRecPtr() : PG_GETARG_INT64(5);
|
||||
|
||||
|
||||
/* Initialize buffer to copy to */
|
||||
bytea *raw_page = (bytea *) palloc(BLCKSZ + VARHDRSZ);
|
||||
SET_VARSIZE(raw_page, BLCKSZ + VARHDRSZ);
|
||||
raw_page_data = VARDATA(raw_page);
|
||||
|
||||
/* Fetch the page image into the bytea payload. */
zenith_read_at_lsn(rnode, forknum, blkno, read_lsn, request_latest, raw_page_data);
|
||||
PG_RETURN_BYTEA_P(raw_page);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Directly calls XLogFlush(lsn) to flush WAL buffers.
|
||||
*/
|
||||
Datum
|
||||
neon_xlogflush(PG_FUNCTION_ARGS)
|
||||
{
|
||||
/*
 * Takes a single LSN argument and passes it straight to XLogFlush().
 * No NULL or privilege check is done in this function.
 * NOTE(review): presumably the SQL-level declaration is STRICT and/or
 * access-restricted - confirm against the extension script.
 */
XLogRecPtr lsn = PG_GETARG_LSN(0);
|
||||
XLogFlush(lsn);
|
||||
PG_RETURN_VOID();
|
||||
}
|
||||
294
poetry.lock
generated
294
poetry.lock
generated
File diff suppressed because one or more lines are too long
@@ -1,11 +1,10 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from typing import List
|
||||
import argparse
|
||||
import enum
|
||||
import subprocess
|
||||
import sys
|
||||
import enum
|
||||
import argparse
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
|
||||
@enum.unique
|
||||
@@ -37,15 +36,24 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
|
||||
return cmd
|
||||
|
||||
|
||||
def yapf(fix_inplace: bool) -> str:
|
||||
cmd = "poetry run yapf --recursive"
|
||||
if fix_inplace:
|
||||
cmd += " --in-place"
|
||||
else:
|
||||
cmd += " --diff"
|
||||
def black(fix_inplace: bool) -> str:
|
||||
cmd = "poetry run black"
|
||||
if not fix_inplace:
|
||||
cmd += " --diff --check"
|
||||
return cmd
|
||||
|
||||
|
||||
def isort(fix_inplace: bool) -> str:
|
||||
cmd = "poetry run isort"
|
||||
if not fix_inplace:
|
||||
cmd += " --diff --check"
|
||||
return cmd
|
||||
|
||||
|
||||
def flake8() -> str:
|
||||
return "poetry run flake8"
|
||||
|
||||
|
||||
def mypy() -> str:
|
||||
return "poetry run mypy"
|
||||
|
||||
@@ -71,11 +79,13 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
|
||||
else:
|
||||
print("Please inspect the output below and run make fmt to fix automatically.")
|
||||
if suffix == ".py":
|
||||
print("If the output is empty, ensure that you've installed Python tooling by\n"
|
||||
"running './scripts/pysync' in the current directory (no root needed)")
|
||||
print(
|
||||
"If the output is empty, ensure that you've installed Python tooling by\n"
|
||||
"running './scripts/pysync' in the current directory (no root needed)"
|
||||
)
|
||||
print()
|
||||
print(res.stdout.decode())
|
||||
exit(1)
|
||||
sys.exit(1)
|
||||
|
||||
print(colorify("[OK]", Color.GREEN, no_color))
|
||||
|
||||
@@ -83,10 +93,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
|
||||
parser.add_argument("--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty())
|
||||
parser.add_argument(
|
||||
"--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty(),
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
files = get_commit_files()
|
||||
@@ -101,9 +113,23 @@ if __name__ == "__main__":
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="yapf",
|
||||
name="isort",
|
||||
suffix=".py",
|
||||
cmd=yapf(fix_inplace=args.fix_inplace),
|
||||
cmd=isort(fix_inplace=args.fix_inplace),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="black",
|
||||
suffix=".py",
|
||||
cmd=black(fix_inplace=args.fix_inplace),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="flake8",
|
||||
suffix=".py",
|
||||
cmd=flake8(),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
|
||||
@@ -11,10 +11,11 @@ bstr = "0.2.17"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
clap = "3.0"
|
||||
futures = "0.3.13"
|
||||
hashbrown = "0.11.2"
|
||||
hashbrown = "0.12"
|
||||
hex = "0.4.3"
|
||||
hmac = "0.12.1"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
once_cell = "1.13.0"
|
||||
md5 = "0.7.0"
|
||||
parking_lot = "0.12"
|
||||
@@ -23,7 +24,7 @@ rand = "0.8.3"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
routerify = "3"
|
||||
rustls = "0.20.0"
|
||||
rustls-pemfile = "0.2.1"
|
||||
rustls-pemfile = "1"
|
||||
scopeguard = "1.1.0"
|
||||
serde = "1"
|
||||
serde_json = "1"
|
||||
|
||||
@@ -127,7 +127,7 @@ impl<T, E> BackendType<Result<T, E>> {
|
||||
}
|
||||
}
|
||||
|
||||
impl BackendType<ClientCredentials> {
|
||||
impl BackendType<ClientCredentials<'_>> {
|
||||
/// Authenticate the client via the requested backend, possibly using credentials.
|
||||
pub async fn authenticate(
|
||||
mut self,
|
||||
@@ -149,7 +149,7 @@ impl BackendType<ClientCredentials> {
|
||||
|
||||
// Finally we may finish the initialization of `creds`.
|
||||
// TODO: add missing type safety to ClientCredentials.
|
||||
creds.project = Some(payload.project);
|
||||
creds.project = Some(payload.project.into());
|
||||
|
||||
let mut config = match &self {
|
||||
Console(creds) => {
|
||||
|
||||
@@ -121,7 +121,7 @@ pub enum AuthInfo {
|
||||
#[must_use]
|
||||
pub(super) struct Api<'a> {
|
||||
endpoint: &'a ApiUrl,
|
||||
creds: &'a ClientCredentials,
|
||||
creds: &'a ClientCredentials<'a>,
|
||||
}
|
||||
|
||||
impl<'a> Api<'a> {
|
||||
@@ -143,7 +143,7 @@ impl<'a> Api<'a> {
|
||||
url.path_segments_mut().push("proxy_get_role_secret");
|
||||
url.query_pairs_mut()
|
||||
.append_pair("project", self.creds.project().expect("impossible"))
|
||||
.append_pair("role", &self.creds.user);
|
||||
.append_pair("role", self.creds.user);
|
||||
|
||||
// TODO: use a proper logger
|
||||
println!("cplane request: {url}");
|
||||
@@ -187,8 +187,8 @@ impl<'a> Api<'a> {
|
||||
config
|
||||
.host(host)
|
||||
.port(port)
|
||||
.dbname(&self.creds.dbname)
|
||||
.user(&self.creds.user);
|
||||
.dbname(self.creds.dbname)
|
||||
.user(self.creds.user);
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
@@ -56,7 +56,7 @@ enum ProxyAuthResponse {
|
||||
NotReady { ready: bool }, // TODO: get rid of `ready`
|
||||
}
|
||||
|
||||
impl ClientCredentials {
|
||||
impl ClientCredentials<'_> {
|
||||
fn is_existing_user(&self) -> bool {
|
||||
self.user.ends_with("@zenith")
|
||||
}
|
||||
@@ -64,15 +64,15 @@ impl ClientCredentials {
|
||||
|
||||
async fn authenticate_proxy_client(
|
||||
auth_endpoint: &reqwest::Url,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ClientCredentials<'_>,
|
||||
md5_response: &str,
|
||||
salt: &[u8; 4],
|
||||
psql_session_id: &str,
|
||||
) -> Result<DatabaseInfo, LegacyAuthError> {
|
||||
let mut url = auth_endpoint.clone();
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", &creds.user)
|
||||
.append_pair("database", &creds.dbname)
|
||||
.append_pair("login", creds.user)
|
||||
.append_pair("database", creds.dbname)
|
||||
.append_pair("md5response", md5_response)
|
||||
.append_pair("salt", &hex::encode(salt))
|
||||
.append_pair("psql_session_id", psql_session_id);
|
||||
@@ -103,7 +103,7 @@ async fn authenticate_proxy_client(
|
||||
async fn handle_existing_user(
|
||||
auth_endpoint: &reqwest::Url,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ClientCredentials<'_>,
|
||||
) -> auth::Result<compute::NodeInfo> {
|
||||
let psql_session_id = super::link::new_psql_session_id();
|
||||
let md5_salt = rand::random();
|
||||
@@ -136,7 +136,7 @@ async fn handle_existing_user(
|
||||
pub async fn handle_user(
|
||||
auth_endpoint: &reqwest::Url,
|
||||
auth_link_uri: &reqwest::Url,
|
||||
creds: &ClientCredentials,
|
||||
creds: &ClientCredentials<'_>,
|
||||
client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin + Send>,
|
||||
) -> auth::Result<compute::NodeInfo> {
|
||||
if creds.is_existing_user() {
|
||||
|
||||
@@ -17,7 +17,7 @@ use tokio::io::{AsyncRead, AsyncWrite};
|
||||
#[must_use]
|
||||
pub(super) struct Api<'a> {
|
||||
endpoint: &'a ApiUrl,
|
||||
creds: &'a ClientCredentials,
|
||||
creds: &'a ClientCredentials<'a>,
|
||||
}
|
||||
|
||||
// Helps eliminate graceless `.map_err` calls without introducing another ctor.
|
||||
@@ -87,8 +87,8 @@ impl<'a> Api<'a> {
|
||||
config
|
||||
.host(self.endpoint.host_str().unwrap_or("localhost"))
|
||||
.port(self.endpoint.port().unwrap_or(5432))
|
||||
.dbname(&self.creds.dbname)
|
||||
.user(&self.creds.user);
|
||||
.dbname(self.creds.dbname)
|
||||
.user(self.creds.user);
|
||||
|
||||
Ok(config)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
//! User credentials used in authentication.
|
||||
|
||||
use crate::error::UserFacingError;
|
||||
use std::borrow::Cow;
|
||||
use thiserror::Error;
|
||||
use utils::pq_proto::StartupMessageParams;
|
||||
|
||||
@@ -27,51 +28,59 @@ impl UserFacingError for ClientCredsParseError {}
|
||||
/// Various client credentials which we use for authentication.
|
||||
/// Note that we don't store any kind of client key or password here.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ClientCredentials {
|
||||
pub user: String,
|
||||
pub dbname: String,
|
||||
pub project: Option<String>,
|
||||
pub struct ClientCredentials<'a> {
|
||||
pub user: &'a str,
|
||||
pub dbname: &'a str,
|
||||
pub project: Option<Cow<'a, str>>,
|
||||
}
|
||||
|
||||
impl ClientCredentials {
|
||||
impl ClientCredentials<'_> {
|
||||
pub fn project(&self) -> Option<&str> {
|
||||
self.project.as_deref()
|
||||
}
|
||||
}
|
||||
|
||||
impl ClientCredentials {
|
||||
impl<'a> ClientCredentials<'a> {
|
||||
pub fn parse(
|
||||
mut options: StartupMessageParams,
|
||||
params: &'a StartupMessageParams,
|
||||
sni: Option<&str>,
|
||||
common_name: Option<&str>,
|
||||
) -> Result<Self, ClientCredsParseError> {
|
||||
use ClientCredsParseError::*;
|
||||
|
||||
// Some parameters are absolutely necessary, others not so much.
|
||||
let mut get_param = |key| options.remove(key).ok_or(MissingKey(key));
|
||||
|
||||
// Some parameters are stored in the startup message.
|
||||
let get_param = |key| params.get(key).ok_or(MissingKey(key));
|
||||
let user = get_param("user")?;
|
||||
let dbname = get_param("database")?;
|
||||
let project_a = get_param("project").ok();
|
||||
|
||||
// Project name might be passed via PG's command-line options.
|
||||
let project_a = params.options_raw().and_then(|options| {
|
||||
for opt in options {
|
||||
if let Some(value) = opt.strip_prefix("project=") {
|
||||
return Some(Cow::Borrowed(value));
|
||||
}
|
||||
}
|
||||
None
|
||||
});
|
||||
|
||||
// Alternative project name is in fact a subdomain from SNI.
|
||||
// NOTE: we do not consider SNI if `common_name` is missing.
|
||||
let project_b = sni
|
||||
.zip(common_name)
|
||||
.map(|(sni, cn)| {
|
||||
// TODO: what if SNI is present but just a common name?
|
||||
subdomain_from_sni(sni, cn)
|
||||
.ok_or_else(|| InconsistentSni(sni.to_owned(), cn.to_owned()))
|
||||
.ok_or_else(|| InconsistentSni(sni.into(), cn.into()))
|
||||
.map(Cow::<'static, str>::Owned)
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
let project = match (project_a, project_b) {
|
||||
// Invariant: if we have both project name variants, they should match.
|
||||
(Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a, b))),
|
||||
(a, b) => a.or(b).map(|name| {
|
||||
// Invariant: project name may not contain certain characters.
|
||||
check_project_name(name).map_err(MalformedProjectName)
|
||||
(Some(a), Some(b)) if a != b => Some(Err(InconsistentProjectNames(a.into(), b.into()))),
|
||||
// Invariant: project name may not contain certain characters.
|
||||
(a, b) => a.or(b).map(|name| match project_name_valid(&name) {
|
||||
false => Err(MalformedProjectName(name.into())),
|
||||
true => Ok(name),
|
||||
}),
|
||||
}
|
||||
.transpose()?;
|
||||
@@ -84,12 +93,8 @@ impl ClientCredentials {
|
||||
}
|
||||
}
|
||||
|
||||
fn check_project_name(name: String) -> Result<String, String> {
|
||||
if name.chars().all(|c| c.is_alphanumeric() || c == '-') {
|
||||
Ok(name)
|
||||
} else {
|
||||
Err(name)
|
||||
}
|
||||
fn project_name_valid(name: &str) -> bool {
|
||||
name.chars().all(|c| c.is_alphanumeric() || c == '-')
|
||||
}
|
||||
|
||||
fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
|
||||
@@ -102,18 +107,14 @@ fn subdomain_from_sni(sni: &str, common_name: &str) -> Option<String> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn make_options<'a, const N: usize>(pairs: [(&'a str, &'a str); N]) -> StartupMessageParams {
|
||||
StartupMessageParams::from(pairs.map(|(k, v)| (k.to_owned(), v.to_owned())))
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore = "TODO: fix how database is handled"]
|
||||
fn parse_bare_minimum() -> anyhow::Result<()> {
|
||||
// According to postgresql, only `user` should be required.
|
||||
let options = make_options([("user", "john_doe")]);
|
||||
let options = StartupMessageParams::new([("user", "john_doe")]);
|
||||
|
||||
// TODO: check that `creds.dbname` is None.
|
||||
let creds = ClientCredentials::parse(options, None, None)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
|
||||
Ok(())
|
||||
@@ -121,9 +122,9 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_missing_project() -> anyhow::Result<()> {
|
||||
let options = make_options([("user", "john_doe"), ("database", "world")]);
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
|
||||
|
||||
let creds = ClientCredentials::parse(options, None, None)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project, None);
|
||||
@@ -133,12 +134,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_project_from_sni() -> anyhow::Result<()> {
|
||||
let options = make_options([("user", "john_doe"), ("database", "world")]);
|
||||
let options = StartupMessageParams::new([("user", "john_doe"), ("database", "world")]);
|
||||
|
||||
let sni = Some("foo.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(options, sni, common_name)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("foo"));
|
||||
@@ -148,13 +149,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_project_from_options() -> anyhow::Result<()> {
|
||||
let options = make_options([
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"),
|
||||
("project", "bar"),
|
||||
("options", "-ckey=1 project=bar -c geqo=off"),
|
||||
]);
|
||||
|
||||
let creds = ClientCredentials::parse(options, None, None)?;
|
||||
let creds = ClientCredentials::parse(&options, None, None)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("bar"));
|
||||
@@ -164,16 +165,16 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_projects_identical() -> anyhow::Result<()> {
|
||||
let options = make_options([
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"),
|
||||
("project", "baz"),
|
||||
("options", "project=baz"),
|
||||
]);
|
||||
|
||||
let sni = Some("baz.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
let creds = ClientCredentials::parse(options, sni, common_name)?;
|
||||
let creds = ClientCredentials::parse(&options, sni, common_name)?;
|
||||
assert_eq!(creds.user, "john_doe");
|
||||
assert_eq!(creds.dbname, "world");
|
||||
assert_eq!(creds.project.as_deref(), Some("baz"));
|
||||
@@ -183,17 +184,17 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_projects_different() {
|
||||
let options = make_options([
|
||||
let options = StartupMessageParams::new([
|
||||
("user", "john_doe"),
|
||||
("database", "world"),
|
||||
("project", "first"),
|
||||
("options", "project=first"),
|
||||
]);
|
||||
|
||||
let sni = Some("second.localhost");
|
||||
let common_name = Some("localhost");
|
||||
|
||||
assert!(matches!(
|
||||
ClientCredentials::parse(options, sni, common_name).expect_err("should fail"),
|
||||
ClientCredentials::parse(&options, sni, common_name).expect_err("should fail"),
|
||||
ClientCredsParseError::InconsistentProjectNames(_, _)
|
||||
));
|
||||
}
|
||||
|
||||
@@ -95,7 +95,7 @@ impl<'a> Session<'a> {
|
||||
|
||||
/// Store the cancel token for the given session.
|
||||
/// This enables query cancellation in [`crate::proxy::handshake`].
|
||||
pub fn enable_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
pub fn enable_query_cancellation(self, cancel_closure: CancelClosure) -> CancelKeyData {
|
||||
self.cancel_map
|
||||
.0
|
||||
.lock()
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
use crate::{cancellation::CancelClosure, error::UserFacingError};
|
||||
use futures::TryFutureExt;
|
||||
use itertools::Itertools;
|
||||
use std::{io, net::SocketAddr};
|
||||
use thiserror::Error;
|
||||
use tokio::net::TcpStream;
|
||||
use tokio_postgres::NoTls;
|
||||
use utils::pq_proto::StartupMessageParams;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ConnectionError {
|
||||
@@ -110,7 +112,42 @@ pub struct PostgresConnection {
|
||||
|
||||
impl NodeInfo {
|
||||
/// Connect to a corresponding compute node.
|
||||
pub async fn connect(&self) -> Result<(PostgresConnection, CancelClosure), ConnectionError> {
|
||||
pub async fn connect(
|
||||
mut self,
|
||||
params: &StartupMessageParams,
|
||||
) -> Result<(PostgresConnection, CancelClosure), ConnectionError> {
|
||||
if let Some(options) = params.options_raw() {
|
||||
// We must drop all proxy-specific parameters.
|
||||
#[allow(unstable_name_collisions)]
|
||||
let options: String = options
|
||||
.filter(|opt| !opt.starts_with("project="))
|
||||
.intersperse(" ") // TODO: use impl from std once it's stabilized
|
||||
.collect();
|
||||
|
||||
self.config.options(&options);
|
||||
}
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.config.application_name(app_name);
|
||||
}
|
||||
|
||||
if let Some(replication) = params.get("replication") {
|
||||
use tokio_postgres::config::ReplicationMode;
|
||||
match replication {
|
||||
"true" | "on" | "yes" | "1" => {
|
||||
self.config.replication_mode(ReplicationMode::Physical);
|
||||
}
|
||||
"database" => {
|
||||
self.config.replication_mode(ReplicationMode::Logical);
|
||||
}
|
||||
_other => {}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: extend the list of the forwarded startup parameters.
|
||||
// Currently, tokio-postgres doesn't allow us to pass
|
||||
// arbitrary parameters, but the ones above are a good start.
|
||||
|
||||
let (socket_addr, mut stream) = self
|
||||
.connect_raw()
|
||||
.await
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::auth;
|
||||
use crate::cancellation::{self, CancelMap};
|
||||
use crate::config::{ProxyConfig, TlsConfig};
|
||||
use crate::config::{AuthUrls, ProxyConfig, TlsConfig};
|
||||
use crate::stream::{MetricsStream, PqStream, Stream};
|
||||
use anyhow::{bail, Context};
|
||||
use futures::TryFutureExt;
|
||||
@@ -93,20 +93,21 @@ async fn handle_client(
|
||||
None => return Ok(()), // it's a cancellation request
|
||||
};
|
||||
|
||||
// Extract credentials which we're going to use for auth.
|
||||
let creds = {
|
||||
let sni = stream.get_ref().sni_hostname();
|
||||
let common_name = tls.and_then(|tls| tls.common_name.as_deref());
|
||||
let result = config
|
||||
.auth_backend
|
||||
.map(|_| auth::ClientCredentials::parse(params, sni, common_name))
|
||||
.map(|_| auth::ClientCredentials::parse(¶ms, sni, common_name))
|
||||
.transpose();
|
||||
|
||||
async { result }.or_else(|e| stream.throw_error(e)).await?
|
||||
};
|
||||
|
||||
let client = Client::new(stream, creds);
|
||||
let client = Client::new(stream, creds, ¶ms);
|
||||
cancel_map
|
||||
.with_session(|session| client.connect_to_db(config, session))
|
||||
.with_session(|session| client.connect_to_db(&config.auth_urls, session))
|
||||
.await
|
||||
}
|
||||
|
||||
@@ -174,38 +175,57 @@ async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
|
||||
}
|
||||
|
||||
/// Thin connection context.
|
||||
struct Client<S> {
|
||||
struct Client<'a, S> {
|
||||
/// The underlying libpq protocol stream.
|
||||
stream: PqStream<S>,
|
||||
/// Client credentials that we care about.
|
||||
creds: auth::BackendType<auth::ClientCredentials>,
|
||||
creds: auth::BackendType<auth::ClientCredentials<'a>>,
|
||||
/// KV-dictionary with PostgreSQL connection params.
|
||||
params: &'a StartupMessageParams,
|
||||
}
|
||||
|
||||
impl<S> Client<S> {
|
||||
impl<'a, S> Client<'a, S> {
|
||||
/// Construct a new connection context.
|
||||
fn new(stream: PqStream<S>, creds: auth::BackendType<auth::ClientCredentials>) -> Self {
|
||||
Self { stream, creds }
|
||||
fn new(
|
||||
stream: PqStream<S>,
|
||||
creds: auth::BackendType<auth::ClientCredentials<'a>>,
|
||||
params: &'a StartupMessageParams,
|
||||
) -> Self {
|
||||
Self {
|
||||
stream,
|
||||
creds,
|
||||
params,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<S> {
|
||||
impl<S: AsyncRead + AsyncWrite + Unpin + Send> Client<'_, S> {
|
||||
/// Let the client authenticate and connect to the designated compute node.
|
||||
async fn connect_to_db(
|
||||
self,
|
||||
config: &ProxyConfig,
|
||||
urls: &AuthUrls,
|
||||
session: cancellation::Session<'_>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Self { mut stream, creds } = self;
|
||||
let Self {
|
||||
mut stream,
|
||||
creds,
|
||||
params,
|
||||
} = self;
|
||||
|
||||
// Authenticate and connect to a compute node.
|
||||
let auth = creds.authenticate(&config.auth_urls, &mut stream).await;
|
||||
let auth = creds.authenticate(urls, &mut stream).await;
|
||||
let node = async { auth }.or_else(|e| stream.throw_error(e)).await?;
|
||||
let reported_auth_ok = node.reported_auth_ok;
|
||||
|
||||
let (db, cancel_closure) = node.connect().or_else(|e| stream.throw_error(e)).await?;
|
||||
let cancel_key_data = session.enable_cancellation(cancel_closure);
|
||||
let (db, cancel_closure) = node
|
||||
.connect(params)
|
||||
.or_else(|e| stream.throw_error(e))
|
||||
.await?;
|
||||
|
||||
let cancel_key_data = session.enable_query_cancellation(cancel_closure);
|
||||
|
||||
// Report authentication success if we haven't done this already.
|
||||
if !node.reported_auth_ok {
|
||||
if !reported_auth_ok {
|
||||
stream
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
|
||||
|
||||
@@ -27,12 +27,54 @@ prometheus-client = "^0.14.1"
|
||||
pytest-timeout = "^2.1.0"
|
||||
Werkzeug = "2.1.2"
|
||||
pytest-order = "^1.0.1"
|
||||
allure-pytest = "^2.10.0"
|
||||
pytest-asyncio = "^0.19.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
yapf = "==0.31.0"
|
||||
flake8 = "^3.9.2"
|
||||
flake8 = "^5.0.4"
|
||||
mypy = "==0.971"
|
||||
black = "^22.6.0"
|
||||
isort = "^5.10.1"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.black]
|
||||
line-length = 100
|
||||
extend-exclude = '''
|
||||
/(
|
||||
vendor
|
||||
)/
|
||||
'''
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
line_length = 100
|
||||
skip_gitignore = true
|
||||
skip = [
|
||||
"vendor",
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
# mypy uses regex
|
||||
exclude = "^vendor/"
|
||||
# some tests don't typecheck when this flag is set
|
||||
check_untyped_defs = false
|
||||
# Help mypy find imports when running against list of individual files.
|
||||
# Without this line it would behave differently when executed on the entire project.
|
||||
mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner"
|
||||
|
||||
disallow_incomplete_defs = false
|
||||
disallow_untyped_calls = false
|
||||
disallow_untyped_decorators = false
|
||||
disallow_untyped_defs = false
|
||||
strict = true
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = [
|
||||
"asyncpg.*",
|
||||
"cached_property.*",
|
||||
"pg8000.*",
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
@@ -13,10 +13,10 @@
|
||||
# avoid running regular linting script that checks every feature.
|
||||
if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
# no extra features to test currently, add more here when needed
|
||||
cargo clippy --all --all-targets -- -A unknown_lints -D warnings
|
||||
cargo clippy --locked --all --all-targets -- -A unknown_lints -D warnings
|
||||
else
|
||||
# * `-A unknown_lints` – do not warn about unknown lint suppressions
|
||||
# that people with newer toolchains might use
|
||||
# * `-D warnings` - fail on any warnings (`cargo` returns non-zero exit status)
|
||||
cargo clippy --all --all-targets --all-features -- -A unknown_lints -D warnings
|
||||
cargo clippy --locked --all --all-targets --all-features -- -A unknown_lints -D warnings
|
||||
fi
|
||||
|
||||
@@ -11,7 +11,6 @@ use anyhow::{bail, Context, Result};
|
||||
|
||||
use postgres_ffi::PG_TLI;
|
||||
use regex::Regex;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use tracing::info;
|
||||
use utils::{
|
||||
@@ -67,18 +66,22 @@ impl postgres_backend::Handler for SafekeeperPostgresHandler {
|
||||
// ztenant id and ztimeline id are passed in connection string params
|
||||
fn startup(&mut self, _pgb: &mut PostgresBackend, sm: &FeStartupPacket) -> Result<()> {
|
||||
if let FeStartupPacket::StartupMessage { params, .. } = sm {
|
||||
self.ztenantid = match params.get("ztenantid") {
|
||||
Some(z) => Some(ZTenantId::from_str(z)?), // just curious, can I do that from .map?
|
||||
_ => None,
|
||||
};
|
||||
|
||||
self.ztimelineid = match params.get("ztimelineid") {
|
||||
Some(z) => Some(ZTimelineId::from_str(z)?),
|
||||
_ => None,
|
||||
};
|
||||
if let Some(options) = params.options_raw() {
|
||||
for opt in options {
|
||||
match opt.split_once('=') {
|
||||
Some(("ztenantid", value)) => {
|
||||
self.ztenantid = Some(value.parse()?);
|
||||
}
|
||||
Some(("ztimelineid", value)) => {
|
||||
self.ztimelineid = Some(value.parse()?);
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(app_name) = params.get("application_name") {
|
||||
self.appname = Some(app_name.clone());
|
||||
self.appname = Some(app_name.to_owned());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user