Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-14 17:02:56 +00:00)
Merge branch 'main' into ps-trace
@@ -11,3 +11,6 @@ opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1

[alias]
build_testing = ["build", "--features", "testing"]
@@ -18,3 +18,4 @@
!vendor/postgres-v15/
!workspace_hack/
!neon_local/
!scripts/ninstall.sh
9 .github/actions/download/action.yml vendored
@@ -12,6 +12,9 @@ inputs:
description: "Allow to skip if file doesn't exist, fail otherwise"
default: false
required: false
prefix:
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
required: false

runs:
using: "composite"
@@ -23,18 +26,18 @@ runs:
TARGET: ${{ inputs.path }}
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)

S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX%$GITHUB_RUN_ATTEMPT} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
echo '::set-output name=SKIPPED::true'
exit 0
else
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist"
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
fi
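For context, a minimal shell sketch (not part of the diff) of the attempt-fallback lookup used above: the S3 prefix now ends with the run attempt, so stripping that suffix with `${PREFIX%$GITHUB_RUN_ATTEMPT}` widens the listing to every attempt of the run, and `sort --version-sort | tail -1` picks the newest matching key. The bucket name comes from the action itself; the artifact filename below is a hypothetical example.

```sh
# Sketch only: resolve the newest artifact across run attempts.
BUCKET=neon-github-public-dev
PREFIX="artifacts/${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}"    # e.g. artifacts/1234/3
FILENAME="neon-Linux-release-artifact.tar.zst"               # hypothetical name

# ${PREFIX%$GITHUB_RUN_ATTEMPT} drops the trailing attempt number -> artifacts/1234/
S3_KEY=$(aws s3api list-objects-v2 --bucket "${BUCKET}" --prefix "${PREFIX%$GITHUB_RUN_ATTEMPT}" \
    | jq -r '.Contents[].Key' \
    | grep "${FILENAME}" \
    | sort --version-sort \
    | tail -1 || true)                                       # newest attempt wins, empty if none
echo "resolved key: ${S3_KEY:-<none>}"
```

If a caller passes an explicit `prefix` input such as `latest`, the suffix strip is a no-op and the lookup simply scans that prefix.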
@@ -60,6 +60,7 @@ runs:
--header "Authorization: Bearer ${API_KEY}" \
--data "{
\"project\": {
\"name\": \"Created by actions/neon-project-create; GITHUB_RUN_ID=${GITHUB_RUN_ID}\",
\"platform_id\": \"aws\",
\"region_id\": \"${REGION_ID}\",
\"settings\": { }
19 .github/actions/run-python-test-set/action.yml vendored
@@ -85,7 +85,8 @@ runs:
# PLATFORM will be embedded in the perf test report
# and it is needed to distinguish different environments
export PLATFORM=${PLATFORM:-github-actions-selfhosted}
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install/v14}
export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install}
export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14}

if [ "${BUILD_TYPE}" = "remote" ]; then
export REMOTE_ENV=1
@@ -112,10 +113,8 @@ runs:
fi

if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi

if [[ "${{ inputs.build_type }}" == "debug" ]]; then
@@ -128,7 +127,7 @@ runs:

# Wake up the cluster if we use remote neon instance
if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then
${POSTGRES_DISTRIB_DIR}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();"
fi

# Run the tests.
@@ -150,11 +149,9 @@ runs:
-rA $TEST_SELECTION $EXTRA_PARAMS

if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO="$PLATFORM"
scripts/generate_and_push_perf_report.sh
fi
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO="$PLATFORM"
scripts/generate_and_push_perf_report.sh
fi

- name: Create Allure report
9 .github/actions/upload/action.yml vendored
@@ -7,6 +7,9 @@ inputs:
path:
description: "A directory or file to upload"
required: true
prefix:
description: "S3 prefix. Default is '${GITHUB_RUN_ID}/${GITHUB_RUN_ATTEMPT}'"
required: false

runs:
using: "composite"
@@ -42,14 +45,14 @@ runs:
env:
SOURCE: ${{ inputs.path }}
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
PREFIX: artifacts/${{ inputs.prefix || format('{0}/{1}', github.run_id, github.run_attempt) }}
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)

FILESIZE=$(du -sh ${ARCHIVE} | cut -f1)

time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}
time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${FILENAME}

# Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary
echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}
echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}
4 .github/ansible/deploy.yaml vendored
@@ -58,7 +58,7 @@
creates: "/storage/pageserver/data/tenants"
environment:
NEON_REPO_DIR: "/storage/pageserver/data"
LD_LIBRARY_PATH: "/usr/local/lib"
LD_LIBRARY_PATH: "/usr/local/v14/lib"
become: true
tags:
- pageserver
@@ -132,7 +132,7 @@
creates: "/storage/safekeeper/data/safekeeper.id"
environment:
NEON_REPO_DIR: "/storage/safekeeper/data"
LD_LIBRARY_PATH: "/usr/local/lib"
LD_LIBRARY_PATH: "/usr/local/v14/lib"
become: true
tags:
- safekeeper
6 .github/ansible/get_binaries.sh vendored
@@ -21,10 +21,14 @@ docker pull --quiet neondatabase/neon:${DOCKER_TAG}
ID=$(docker create neondatabase/neon:${DOCKER_TAG})
docker cp ${ID}:/data/postgres_install.tar.gz .
tar -xzf postgres_install.tar.gz -C neon_install
mkdir neon_install/bin/
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/
docker cp ${ID}:/usr/local/v14/bin/ neon_install/v14/bin/
docker cp ${ID}:/usr/local/v15/bin/ neon_install/v15/bin/
docker cp ${ID}:/usr/local/v14/lib/ neon_install/v14/lib/
docker cp ${ID}:/usr/local/v15/lib/ neon_install/v15/lib/
docker rm -vf ${ID}

# store version to file (for ansible playbooks) and create binaries tarball
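The copies above lay the binaries out per PostgreSQL version inside `neon_install` (`v14/bin`, `v14/lib`, `v15/bin`, `v15/lib`, plus the version-independent `bin/`). A rough sketch of how a consumer can pick binaries for one version from that layout; the directory names come from the script, the selection logic itself is illustrative only.

```sh
# Sketch only: select per-version Postgres binaries from the layout built above.
PG_VERSION="${DEFAULT_PG_VERSION:-14}"
NEON_INSTALL_DIR="./neon_install"                 # as populated by get_binaries.sh

PG_BIN="${NEON_INSTALL_DIR}/v${PG_VERSION}/bin"
PG_LIB="${NEON_INSTALL_DIR}/v${PG_VERSION}/lib"

# Mirrors the LD_LIBRARY_PATH updates made elsewhere in this diff.
export LD_LIBRARY_PATH="${PG_LIB}:${LD_LIBRARY_PATH:-}"
"${PG_BIN}/pg_config" --version                   # or any other per-version binary
```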
5 .github/ansible/staging.hosts vendored
@@ -2,11 +2,16 @@
#zenith-us-stage-ps-1 console_region_id=27
zenith-us-stage-ps-2 console_region_id=27
zenith-us-stage-ps-3 console_region_id=27
zenith-us-stage-ps-4 console_region_id=27
zenith-us-stage-test-ps-1 console_region_id=28

[safekeepers]
zenith-us-stage-sk-4 console_region_id=27
zenith-us-stage-sk-5 console_region_id=27
zenith-us-stage-sk-6 console_region_id=27
zenith-us-stage-test-sk-1 console_region_id=28
zenith-us-stage-test-sk-2 console_region_id=28
zenith-us-stage-test-sk-3 console_region_id=28

[storage:children]
pageservers
2 .github/ansible/systemd/pageserver.service vendored
@@ -5,7 +5,7 @@ After=network.target auditd.service
[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
2 .github/ansible/systemd/safekeeper.service vendored
@@ -5,7 +5,7 @@ After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/v14/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
59 .github/workflows/benchmarking.yml vendored
@@ -11,7 +11,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '36 4 * * *' # run once a day, timezone is utc
- cron: '0 3 * * *' # run once a day, timezone is utc

workflow_dispatch: # adds ability to run this manually
inputs:
@@ -19,8 +19,12 @@ on:
description: 'Environment to run remote tests on (dev or staging)'
required: false
region_id:
description: 'Use a particular region. If empty the default one will be used'
false: true
description: 'Use a particular region. If not set the default region will be used'
required: false
save_perf_report:
type: boolean
description: 'Publish perf report or not. If not set, the report is published only for the main branch'
required: false

defaults:
run:
@@ -42,7 +46,8 @@ jobs:
runs-on: [self-hosted, zenith-benchmarker]

env:
POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"
POSTGRES_DISTRIB_DIR: /tmp/pg_install
DEFAULT_PG_VERSION: 14

steps:
- name: Checkout zenith repo
@@ -67,7 +72,7 @@ jobs:
echo Poetry
poetry --version
echo Pgbench
$POSTGRES_DISTRIB_DIR/bin/pgbench --version
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version

- name: Create Neon Project
id: create-neon-project
@@ -136,17 +141,19 @@ jobs:
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: "10gb"
POSTGRES_DISTRIB_DIR: /usr
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: true
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref == 'refs/heads/main' ) }}

strategy:
fail-fast: false
matrix:
# neon-captest: Run pgbench, reusing existing project
# neon-captest-new: Same, but on a freshly created project
platform: [ neon-captest, neon-captest-new, rds-aurora ]
# neon-captest-new: Run pgbench in a freshly created project
# neon-captest-reuse: Same, but reusing existing project
# neon-captest-prefetch: Same, with prefetching enabled (new project)
platform: [ neon-captest-new, neon-captest-reuse, neon-captest-prefetch, rds-aurora ]

runs-on: dev
container:
@@ -158,13 +165,20 @@ jobs:
steps:
- uses: actions/checkout@v3

- name: Install Deps
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-release-artifact
path: /tmp/neon/
prefix: latest

- name: Add Postgres binaries to PATH
run: |
sudo apt -y update
sudo apt install -y postgresql-14
${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version
echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH

- name: Create Neon Project
if: matrix.platform == 'neon-captest-new'
if: matrix.platform != 'neon-captest-reuse'
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
@@ -175,17 +189,17 @@ jobs:
id: set-up-connstr
run: |
case "${PLATFORM}" in
neon-captest)
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neon-captest-new)
neon-captest-new | neon-captest-prefetch)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
CONNSTR=${{ secrets.BENCHMARK_RDS_CONNSTR }}
;;
*)
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest', 'neon-captest-new' or 'rds-aurora'"
echo 2>&1 "Unknown PLATFORM=${PLATFORM}. Allowed only 'neon-captest-reuse', 'neon-captest-new', 'neon-captest-prefetch' or 'rds-aurora'"
exit 1
;;
esac
@@ -196,6 +210,14 @@ jobs:
env:
PLATFORM: ${{ matrix.platform }}

- name: Set database options
if: matrix.platform == 'neon-captest-prefetch'
run: |
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET enable_seqscan_prefetch=on"
psql ${BENCHMARK_CONNSTR} -c "ALTER DATABASE main SET seqscan_prefetch_buffers=10"
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}

- name: Benchmark init
uses: ./.github/actions/run-python-test-set
with:
@@ -239,13 +261,14 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

- name: Create Allure report
if: always()
uses: ./.github/actions/allure-report
with:
action: generate
build_type: ${{ env.BUILD_TYPE }}

- name: Delete Neon Project
if: ${{ matrix.platform == 'neon-captest-new' && always() }}
if: ${{ matrix.platform != 'neon-captest-reuse' && always() }}
uses: ./.github/actions/neon-project-delete
with:
environment: dev
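A small aside on the "Set database options" step above: `ALTER DATABASE ... SET` only affects sessions opened after the change, so a quick way to confirm the per-database GUCs were recorded is to read them back from the catalog. The connection string variable, the database name `main`, and the two settings are taken from the workflow; the check itself is an illustrative sketch, not part of the diff.

```sh
# Sketch only: confirm the per-database settings were stored.
# ALTER DATABASE ... SET applies to *new* sessions; the values live in pg_db_role_setting.
psql "${BENCHMARK_CONNSTR}" -c "
    SELECT setconfig
    FROM pg_db_role_setting
    WHERE setdatabase = (SELECT oid FROM pg_database WHERE datname = 'main');"
# Expected to list: {enable_seqscan_prefetch=on,seqscan_prefetch_buffers=10}
```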
65 .github/workflows/build_and_test.yml vendored
@@ -94,15 +94,17 @@ jobs:
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES=""
CARGO_FLAGS="--locked --timings"
CARGO_FEATURES="--features testing"
CARGO_FLAGS="--locked --timings $CARGO_FEATURES"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features profiling"
CARGO_FEATURES="--features testing,profiling"
CARGO_FLAGS="--locked --timings --release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
@@ -125,8 +127,8 @@ jobs:
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-

- name: Cache postgres v14 build
id: cache_pg_14
@@ -158,7 +160,7 @@ jobs:

- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
${cov_prefix} mold -run cargo build $CARGO_FLAGS --bins --tests
shell: bash -euxo pipefail {0}

- name: Run cargo test
@@ -266,6 +268,32 @@ jobs:
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data

upload-latest-artifacts:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init
needs: [ regress-tests ]
if: github.ref_name == 'main'
steps:
- name: Copy Neon artifact to the latest directory
shell: bash -euxo pipefail {0}
env:
BUCKET: neon-github-public-dev
PREFIX: artifacts/${{ github.run_id }}
run: |
for build_type in debug release; do
FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst

S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi

time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/artifacts/latest/${FILENAME}
done

benchmarks:
runs-on: dev
container:
@@ -290,7 +318,7 @@ jobs:
build_type: ${{ matrix.build_type }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
save_perf_report: ${{ github.ref == 'refs/heads/main' }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -322,6 +350,7 @@ jobs:
build_type: ${{ matrix.build_type }}

- name: Store Allure test stat in the DB
if: ${{ steps.create-allure-report.outputs.report-url }}
env:
BUILD_TYPE: ${{ matrix.build_type }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
@@ -332,9 +361,6 @@ jobs:
curl --fail --output suites.json ${REPORT_URL%/index.html}/data/suites.json
./scripts/pysync

# Workaround for https://github.com/neondatabase/cloud/issues/2188
psql "$TEST_RESULT_CONNSTR" -c "SELECT 1;" || sleep 10

DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json

coverage-report:
@@ -363,7 +389,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v8-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}
key: v9-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('Cargo.lock') }}

- name: Get Neon artifact
uses: ./.github/actions/download
@@ -585,7 +611,16 @@ jobs:
- name: Pull rust image from ECR
run: crane pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned rust

- name: Configure docker login
- name: Push images to production ECR
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
run: |
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/neon:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-tools:latest
crane copy 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node:$GITHUB_RUN_ID 093970136003.dkr.ecr.us-east-2.amazonaws.com/compute-node:latest

- name: Configure Docker Hub login
run: |
# ECR Credential Helper & Docker Hub don't work together in config, hence reset
echo "" > /github/home/.docker/config.json
@@ -606,7 +641,7 @@ jobs:
- name: Push rust image to Docker Hub
run: crane push rust neondatabase/rust:pinned

- name: Add latest tag to images
- name: Add latest tag to images in Docker Hub
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
@@ -733,5 +768,5 @@ jobs:
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.tag.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
24 .github/workflows/codestyle.yml vendored
@@ -30,7 +30,7 @@ jobs:
# this is all we need to install our toolchain later via rust-toolchain.toml
# so don't install any toolchain explicitly.
os: [ubuntu-latest, macos-latest]
timeout-minutes: 60
timeout-minutes: 90
name: check codestyle rust and postgres
runs-on: ${{ matrix.os }}

@@ -106,7 +106,7 @@ jobs:
!~/.cargo/registry/src
~/.cargo/git
target
key: v4-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust
key: v5-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust

- name: Run cargo clippy
run: ./run_clippy.sh
@@ -114,6 +114,26 @@ jobs:
- name: Ensure all project builds
run: cargo build --locked --all --all-targets

check-rust-dependencies:
runs-on: dev
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
options: --init

steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1

# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check every project module is covered by Hakari
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
shell: bash -euxo pipefail {0}

check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
steps:
4 .github/workflows/pg_clients.yml vendored
@@ -58,12 +58,12 @@ jobs:
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install/v14
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
mkdir -p "$POSTGRES_DISTRIB_DIR/v14/bin" && touch "$POSTGRES_DISTRIB_DIR/v14/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
363 Cargo.lock generated
@@ -37,6 +37,12 @@ dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "amplify_num"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f27d3d00d3d115395a7a8a4dc045feb7aa82b641e485f7e15f4e67ac16f4f56d"
|
||||
|
||||
[[package]]
|
||||
name = "ansi_term"
|
||||
version = "0.12.1"
|
||||
@@ -135,6 +141,15 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic-polyfill"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c041a8d9751a520ee19656232a18971f18946a7900f1520ee4400002244dd89"
|
||||
dependencies = [
|
||||
"critical-section",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atty"
|
||||
version = "0.2.14"
|
||||
@@ -212,6 +227,21 @@ dependencies = [
|
||||
"rustc-demangle",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bare-metal"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5deb64efa5bd81e31fcd1938615a6d98c82eafcbcd787162b6f63b91d6bac5b3"
|
||||
dependencies = [
|
||||
"rustc_version 0.2.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bare-metal"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8fe8f5a8a398345e52358e18ff07cc17a568fbca5c6f73873d3a62056309603"
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.13.0"
|
||||
@@ -229,14 +259,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.59.2"
|
||||
version = "0.60.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2bd2a9a458e8f4304c52c43ebb0cfbd520289f8379a52e329a38afda99bf8eb8"
|
||||
checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"clap 2.34.0",
|
||||
"clap 3.2.16",
|
||||
"env_logger",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
@@ -250,6 +280,18 @@ dependencies = [
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bit_field"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4"
|
||||
|
||||
[[package]]
|
||||
name = "bitfield"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "46afbd2983a5d5a7bd740ccb198caf5b82f45c40c09c0eed36052d91cb92e719"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
@@ -377,13 +419,9 @@ version = "2.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
|
||||
dependencies = [
|
||||
"ansi_term",
|
||||
"atty",
|
||||
"bitflags",
|
||||
"strsim 0.8.0",
|
||||
"textwrap 0.11.0",
|
||||
"unicode-width",
|
||||
"vec_map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -396,7 +434,7 @@ dependencies = [
|
||||
"bitflags",
|
||||
"clap_lex",
|
||||
"indexmap",
|
||||
"strsim 0.10.0",
|
||||
"strsim",
|
||||
"termcolor",
|
||||
"textwrap 0.15.0",
|
||||
]
|
||||
@@ -459,8 +497,10 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap 3.2.16",
|
||||
"env_logger",
|
||||
"futures",
|
||||
"hyper",
|
||||
"log",
|
||||
"notify",
|
||||
"postgres",
|
||||
"regex",
|
||||
"serde",
|
||||
@@ -502,11 +542,11 @@ dependencies = [
|
||||
"git-version",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"pageserver",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"safekeeper",
|
||||
"safekeeper_api",
|
||||
"serde",
|
||||
"serde_with",
|
||||
"tar",
|
||||
@@ -532,6 +572,18 @@ version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc"
|
||||
|
||||
[[package]]
|
||||
name = "cortex-m"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70858629a458fdfd39f9675c4dc309411f2a3f83bede76988d81bf1a0ecee9e0"
|
||||
dependencies = [
|
||||
"bare-metal 0.2.5",
|
||||
"bitfield",
|
||||
"embedded-hal",
|
||||
"volatile-register",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cpp_demangle"
|
||||
version = "0.3.5"
|
||||
@@ -556,7 +608,7 @@ version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3dfea2db42e9927a3845fb268a10a72faed6d416065f77873f05e411457c363e"
|
||||
dependencies = [
|
||||
"rustc_version",
|
||||
"rustc_version 0.4.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -604,6 +656,18 @@ dependencies = [
|
||||
"itertools",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "critical-section"
|
||||
version = "0.2.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95da181745b56d4bd339530ec393508910c909c784e8962d15d722bacf0bcbcd"
|
||||
dependencies = [
|
||||
"bare-metal 1.0.0",
|
||||
"cfg-if",
|
||||
"cortex-m",
|
||||
"riscv",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.6"
|
||||
@@ -746,7 +810,7 @@ dependencies = [
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim 0.10.0",
|
||||
"strsim",
|
||||
"syn",
|
||||
]
|
||||
|
||||
@@ -848,6 +912,16 @@ version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f107b87b6afc2a64fd13cac55fe06d6c8859f12d4b14cbcdd2c67d0976781be"
|
||||
|
||||
[[package]]
|
||||
name = "embedded-hal"
|
||||
version = "0.2.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35949884794ad573cf46071e41c9b60efb0cb311e3ca01f7af807af1debc66ff"
|
||||
dependencies = [
|
||||
"nb 0.1.3",
|
||||
"void",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.31"
|
||||
@@ -1000,6 +1074,15 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fsevent-sys"
|
||||
version = "4.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.21"
|
||||
@@ -1169,6 +1252,15 @@ version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
|
||||
|
||||
[[package]]
|
||||
name = "hash32"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
@@ -1178,6 +1270,19 @@ dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heapless"
|
||||
version = "0.7.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743"
|
||||
dependencies = [
|
||||
"atomic-polyfill",
|
||||
"hash32",
|
||||
"rustc_version 0.4.0",
|
||||
"spin 0.9.4",
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.3.3"
|
||||
@@ -1399,6 +1504,26 @@ dependencies = [
|
||||
"str_stack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify"
|
||||
version = "0.9.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"inotify-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inotify-sys"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "instant"
|
||||
version = "0.1.12"
|
||||
@@ -1458,6 +1583,26 @@ dependencies = [
|
||||
"simple_asn1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kqueue"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4d6112e8f37b59803ac47a42d14f1f3a59bbf72fc6857ffc5be455e28a691f8e"
|
||||
dependencies = [
|
||||
"kqueue-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kqueue-sys"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8367585489f01bc55dd27404dcf56b95e6da061a256a666ab23be9ba96a2e587"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kstring"
|
||||
version = "1.0.6"
|
||||
@@ -1495,6 +1640,12 @@ dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "292a948cd991e376cf75541fe5b97a1081d713c618b4f1b9500f8844e49eb565"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.7"
|
||||
@@ -1653,6 +1804,21 @@ dependencies = [
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nb"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "801d31da0513b6ec5214e9bf433a77966320625a37860f910be265be6e18d06f"
|
||||
dependencies = [
|
||||
"nb 1.0.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nb"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "546c37ac5d9e56f55e73b677106873d9d9f5190605e41a856503623648488cae"
|
||||
|
||||
[[package]]
|
||||
name = "nix"
|
||||
version = "0.23.1"
|
||||
@@ -1682,6 +1848,24 @@ dependencies = [
|
||||
"minimal-lexical",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "notify"
|
||||
version = "5.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed2c66da08abae1c024c01d635253e402341b4060a12e99b31c7594063bf490a"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"crossbeam-channel",
|
||||
"filetime",
|
||||
"fsevent-sys",
|
||||
"inotify",
|
||||
"kqueue",
|
||||
"libc",
|
||||
"mio",
|
||||
"walkdir",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-bigint"
|
||||
version = "0.4.3"
|
||||
@@ -1720,6 +1904,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1832,6 +2017,7 @@ checksum = "648001efe5d5c0102d8cea768e348da85d90af8ba91f0bea908f157951493cd4"
|
||||
name = "pageserver"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"amplify_num",
|
||||
"anyhow",
|
||||
"async-stream",
|
||||
"async-trait",
|
||||
@@ -1856,7 +2042,9 @@ dependencies = [
|
||||
"itertools",
|
||||
"metrics",
|
||||
"nix",
|
||||
"num-traits",
|
||||
"once_cell",
|
||||
"pageserver_api",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
@@ -1865,6 +2053,7 @@ dependencies = [
|
||||
"rand",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"rstar",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -1885,6 +2074,17 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pageserver_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"const_format",
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.11.2"
|
||||
@@ -2288,6 +2488,7 @@ dependencies = [
|
||||
"tokio-rustls",
|
||||
"url",
|
||||
"utils",
|
||||
"uuid",
|
||||
"workspace_hack",
|
||||
"x509-parser",
|
||||
]
|
||||
@@ -2449,6 +2650,7 @@ dependencies = [
|
||||
"tokio-util",
|
||||
"toml_edit",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
@@ -2518,12 +2720,33 @@ dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"spin",
|
||||
"spin 0.5.2",
|
||||
"untrusted",
|
||||
"web-sys",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "riscv"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6907ccdd7a31012b70faf2af85cd9e5ba97657cc3987c4f13f8e4d2c2a088aba"
|
||||
dependencies = [
|
||||
"bare-metal 1.0.0",
|
||||
"bit_field",
|
||||
"riscv-target",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "riscv-target"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "88aa938cda42a0cf62a20cfe8d139ff1af20c2e681212b5b34adb5a58333f222"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "routerify"
|
||||
version = "3.0.0"
|
||||
@@ -2537,6 +2760,17 @@ dependencies = [
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rstar"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa"
|
||||
dependencies = [
|
||||
"heapless",
|
||||
"num-traits",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rstest"
|
||||
version = "0.12.0"
|
||||
@@ -2546,7 +2780,7 @@ dependencies = [
|
||||
"cfg-if",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustc_version",
|
||||
"rustc_version 0.4.0",
|
||||
"syn",
|
||||
]
|
||||
|
||||
@@ -2568,7 +2802,7 @@ dependencies = [
|
||||
"log",
|
||||
"rusoto_credential",
|
||||
"rusoto_signature",
|
||||
"rustc_version",
|
||||
"rustc_version 0.4.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tokio",
|
||||
@@ -2626,7 +2860,7 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rusoto_credential",
|
||||
"rustc_version",
|
||||
"rustc_version 0.4.0",
|
||||
"serde",
|
||||
"sha2 0.9.9",
|
||||
"tokio",
|
||||
@@ -2644,13 +2878,22 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
|
||||
dependencies = [
|
||||
"semver 0.9.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366"
|
||||
dependencies = [
|
||||
"semver",
|
||||
"semver 1.0.13",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -2724,16 +2967,19 @@ dependencies = [
|
||||
"hyper",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres_ffi",
|
||||
"regex",
|
||||
"remote_storage",
|
||||
"safekeeper_api",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_with",
|
||||
"signal-hook",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"toml_edit",
|
||||
@@ -2743,6 +2989,17 @@ dependencies = [
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"const_format",
|
||||
"serde",
|
||||
"serde_with",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
@@ -2801,12 +3058,27 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
||||
dependencies = [
|
||||
"semver-parser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93f6841e709003d68bb2deee8c343572bf446003ec20a583e76f7b15cebf3711"
|
||||
|
||||
[[package]]
|
||||
name = "semver-parser"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.142"
|
||||
@@ -3000,6 +3272,15 @@ version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||
|
||||
[[package]]
|
||||
name = "spin"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
@@ -3022,12 +3303,6 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.10.0"
|
||||
@@ -3677,6 +3952,10 @@ name = "uuid"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc5cf98d8186244414c848017f0e2676b3fcb46807f6668a97dfe67359a3c4b7"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
@@ -3684,24 +3963,39 @@ version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
||||
|
||||
[[package]]
|
||||
name = "vcell"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77439c1b53d2303b20d9459b1ade71a83c716e3f9c34f3228c00e6f185d6c002"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "vec_map"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
|
||||
[[package]]
|
||||
name = "void"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d"
|
||||
|
||||
[[package]]
|
||||
name = "volatile-register"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ee8f19f9d74293faf70901bc20ad067dc1ad390d2cbf1e3f75f721ffee908b6"
|
||||
dependencies = [
|
||||
"vcell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wal_craft"
|
||||
version = "0.1.0"
|
||||
@@ -3714,6 +4008,7 @@ dependencies = [
|
||||
"postgres",
|
||||
"postgres_ffi",
|
||||
"tempfile",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -3947,16 +4242,10 @@ dependencies = [
|
||||
"bstr",
|
||||
"bytes",
|
||||
"chrono",
|
||||
"clap 2.34.0",
|
||||
"crossbeam-utils",
|
||||
"either",
|
||||
"fail",
|
||||
"futures-channel",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
"generic-array",
|
||||
"hashbrown",
|
||||
"hex",
|
||||
"hyper",
|
||||
"indexmap",
|
||||
"itoa 0.4.8",
|
||||
"libc",
|
||||
@@ -3973,12 +4262,14 @@ dependencies = [
|
||||
"regex-syntax",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
"syn",
|
||||
"time 0.3.12",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
14 Dockerfile
@@ -14,13 +14,13 @@ COPY --chown=nonroot vendor/postgres-v14 vendor/postgres-v14
COPY --chown=nonroot vendor/postgres-v15 vendor/postgres-v15
COPY --chown=nonroot pgxn pgxn
COPY --chown=nonroot Makefile Makefile
COPY --chown=nonroot scripts/ninstall.sh scripts/ninstall.sh

ENV BUILD_TYPE release
RUN set -e \
&& mold -run make -j $(nproc) -s neon-pg-ext \
&& rm -rf pg_install/v14/build \
&& rm -rf pg_install/v15/build \
&& tar -C pg_install/v14 -czf /home/nonroot/postgres_install.tar.gz .
&& rm -rf pg_install/build \
&& tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .

# Build neon binaries
FROM $REPOSITORY/$IMAGE:$TAG AS build
@@ -44,7 +44,7 @@ COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
RUN set -e \
&& mold -run cargo build --locked --release \
&& mold -run cargo build --bin pageserver --bin safekeeper --bin proxy --locked --release \
&& cachepot -s

# Build final image
@@ -67,8 +67,8 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr
COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin

# v14 is default for now
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/
COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/

# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
@@ -77,7 +77,7 @@ RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \
-c "broker_endpoints=['http://etcd:2379']" \
-c "pg_distrib_dir='/usr/local'" \
-c "pg_distrib_dir='/usr/local/'" \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"
@@ -8,9 +8,12 @@ ARG TAG=pinned
# Layer "build-deps"
#
FROM debian:bullseye-slim AS build-deps
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update
RUN apt update && \
apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \
libcurl4-openssl-dev libossp-uuid-dev
libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev

#
# Layer "pg-build"
@@ -37,7 +40,7 @@ RUN cd postgres && \
FROM build-deps AS postgis-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc wget
apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc

RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
tar xvzf postgis-3.3.0.tar.gz && \
@@ -59,15 +62,13 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \
# Build plv8
#
FROM build-deps AS plv8-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
RUN apt update && \
apt install -y git curl wget make ninja-build build-essential libncurses5 python3-dev pkg-config libc++-dev libc++abi-dev libglib2.0-dev
apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5

# https://github.com/plv8/plv8/issues/475
# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary
RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \
echo "APT::Default-Release \"stable\";" > /etc/apt/apt.conf.d/default-release && \
apt update && \
RUN apt update && \
apt install -y --no-install-recommends -t testing binutils

RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
@@ -79,12 +80,46 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
rm -rf /plv8-* && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control

#
# Layer "h3-pg-build"
# Build h3_pg
#
FROM build-deps AS h3-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

# packaged cmake is too old
RUN apt update && \
apt install -y --no-install-recommends -t testing cmake

RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
tar xvzf h3.tgz && \
cd h3-4.0.1 && \
mkdir build && \
cd build && \
cmake .. -DCMAKE_BUILD_TYPE=Release && \
make -j $(getconf _NPROCESSORS_ONLN) && \
DESTDIR=/h3 make install && \
cp -R /h3/usr / && \
rm -rf build

RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
tar xvzf h3-pg.tgz && \
cd h3-pg-4.0.1 && \
export PATH="/usr/local/pgsql/bin:$PATH" && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control

#
# Layer "neon-pg-ext-build"
# compile neon extensions
#
FROM build-deps AS neon-pg-ext-build
COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/
# plv8 still sometimes crashes during the creation
# COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=h3-pg-build /h3/usr /
COPY pgxn/ pgxn/

RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -132,8 +167,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
chmod 0750 /var/db/postgres/compute && \
echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig

# TODO: Check if we can make the extension setup more modular versus a linear build
# currently plv8-build copies the output /usr/local/pgsql from postgis-build, etc#
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
2 Makefile
@@ -38,7 +38,7 @@ endif
# headers, the mtime of the headers are not changed when there have
# been no changes to the files. Changing the mtime triggers an
# unnecessary rebuild of 'postgres_ffi'.
PG_CONFIGURE_OPTS += INSTALL='install -C'
PG_CONFIGURE_OPTS += INSTALL='$(ROOT_PROJECT_DIR)/scripts/ninstall.sh -C'

# Choose whether we should be silent or verbose
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
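For readers unfamiliar with the `-C` behaviour the comment above relies on: it installs a file only when its content differs from what is already at the destination, which is what keeps the mtimes of unchanged headers stable. A rough shell sketch of that idea follows; this is not the actual `scripts/ninstall.sh`, whose contents are not shown in this diff.

```sh
# Sketch only: an "install -C"-style compare-before-copy helper.
# Usage: compare_install <src> <dstdir>
compare_install() {
    src=$1
    dstdir=$2
    dst="$dstdir/$(basename "$src")"
    if cmp -s "$src" "$dst"; then
        # Identical content: leave the destination untouched so its mtime does not
        # change and downstream rules (like the postgres_ffi rebuild mentioned above)
        # are not retriggered.
        return 0
    fi
    install -m 644 "$src" "$dst"
}
```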
@@ -222,7 +222,12 @@ Ensure your dependencies are installed as described [here](https://github.com/ne

```sh
git clone --recursive https://github.com/neondatabase/neon.git
make # builds also postgres and installs it to ./pg_install

# either:
CARGO_BUILD_FLAGS="--features=testing" make
# or:
make debug

./scripts/pytest
```
@@ -8,8 +8,10 @@ anyhow = "1.0"
chrono = "0.4"
clap = "3.0"
env_logger = "0.9"
futures = "0.3.13"
hyper = { version = "0.14", features = ["full"] }
log = { version = "0.4", features = ["std", "serde"] }
notify = "5.0.0"
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
regex = "1"
serde = { version = "1.0", features = ["derive"] }
@@ -258,14 +258,7 @@ impl ComputeNode {
.spawn()
.expect("cannot start postgres process");

// Try default Postgres port if it is not provided
let port = self
.spec
.cluster
.settings
.find("port")
.unwrap_or_else(|| "5432".to_string());
wait_for_postgres(&mut pg, &port, pgdata_path)?;
wait_for_postgres(&mut pg, pgdata_path)?;

// If connection fails,
// it may be the old node with `zenith_admin` superuser.
@@ -1,18 +1,19 @@
|
||||
use std::fmt::Write;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Child;
|
||||
use std::str::FromStr;
|
||||
use std::{fs, thread, time};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use postgres::{Client, Transaction};
|
||||
use serde::Deserialize;
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: u64 = 60 * 1000; // milliseconds
|
||||
use notify::{RecursiveMode, Watcher};
|
||||
|
||||
const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds
|
||||
|
||||
/// Rust representation of Postgres role info with only those fields
|
||||
/// that matter for us.
|
||||
@@ -230,52 +231,85 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
Ok(postgres_dbs)
|
||||
}
|
||||
|
||||
/// Wait for Postgres to become ready to accept connections:
|
||||
/// - state should be `ready` in the `pgdata/postmaster.pid`
|
||||
/// - and we should be able to connect to 127.0.0.1:5432
|
||||
pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
|
||||
/// Wait for Postgres to become ready to accept connections. It's ready to
|
||||
/// accept connections when the state-field in `pgdata/postmaster.pid` says
|
||||
/// 'ready'.
|
||||
pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> {
|
||||
let pid_path = pgdata.join("postmaster.pid");
|
||||
let mut slept: u64 = 0; // ms
|
||||
let pause = time::Duration::from_millis(100);
|
||||
|
||||
let timeout = time::Duration::from_millis(10);
|
||||
let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
|
||||
// PostgreSQL writes the line "ready" to the postmaster.pid file when it has
|
||||
// completed initialization and is ready to accept connections. We want to
|
||||
// react quickly and perform the rest of our initialization as soon as
|
||||
// PostgreSQL starts accepting connections. Use 'notify' to be notified
|
||||
// whenever the PID file is changed, and whenever it changes, read it to
|
||||
// check if it's now "ready".
|
||||
//
|
||||
// You cannot actually watch a file before it exists, so we first watch the
|
||||
// data directory, and once the postmaster.pid file appears, we switch to
|
||||
// watch the file instead. We also wake up every 100 ms to poll, just in
|
||||
// case we miss some events for some reason. Not strictly necessary, but
|
||||
// better safe than sorry.
|
||||
let (tx, rx) = std::sync::mpsc::channel();
|
||||
let mut watcher = notify::recommended_watcher(move |res| {
|
||||
let _ = tx.send(res);
|
||||
})?;
|
||||
watcher.watch(pgdata, RecursiveMode::NonRecursive)?;
|
||||
|
||||
let started_at = Instant::now();
|
||||
let mut postmaster_pid_seen = false;
|
||||
loop {
|
||||
// Sleep for POSTGRES_WAIT_TIMEOUT at most (a bit longer, actually, if you count the TCP timeout,
|
||||
// but postgres starts listening almost immediately, even if it is not really
|
||||
// ready to accept connections).
|
||||
if slept >= POSTGRES_WAIT_TIMEOUT {
|
||||
bail!("timed out while waiting for Postgres to start");
|
||||
}
|
||||
|
||||
if let Ok(Some(status)) = pg.try_wait() {
|
||||
// Postgres exited, that is not what we expected, bail out earlier.
|
||||
let code = status.code().unwrap_or(-1);
|
||||
bail!("Postgres exited unexpectedly with code {}", code);
|
||||
}
|
||||
|
||||
let res = rx.recv_timeout(Duration::from_millis(100));
|
||||
log::debug!("woken up by notify: {res:?}");
|
||||
// If there are multiple events in the channel already, we only need to be
|
||||
// check once. Swallow the extra events before we go ahead to check the
|
||||
// pid file.
|
||||
while let Ok(res) = rx.try_recv() {
|
||||
log::debug!("swallowing extra event: {res:?}");
|
||||
}
|
||||
|
||||
// Check that we can open pid file first.
|
||||
if let Ok(file) = File::open(&pid_path) {
|
||||
if !postmaster_pid_seen {
|
||||
log::debug!("postmaster.pid appeared");
|
||||
watcher
|
||||
.unwatch(pgdata)
|
||||
.expect("Failed to remove pgdata dir watch");
|
||||
watcher
|
||||
.watch(&pid_path, RecursiveMode::NonRecursive)
|
||||
.expect("Failed to add postmaster.pid file watch");
|
||||
postmaster_pid_seen = true;
|
||||
}
|
||||
|
||||
let file = BufReader::new(file);
|
||||
let last_line = file.lines().last();
|
||||
|
||||
// Pid file could be there and we could read it, but it could be empty, for example.
|
||||
if let Some(Ok(line)) = last_line {
|
||||
let status = line.trim();
|
||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||
log::debug!("last line of postmaster.pid: {status:?}");
|
||||
|
||||
// Now Postgres is ready to accept connections
|
||||
if status == "ready" && can_connect {
|
||||
if status == "ready" {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
thread::sleep(pause);
|
||||
slept += 100;
|
||||
// Give up after POSTGRES_WAIT_TIMEOUT.
|
||||
let duration = started_at.elapsed();
|
||||
if duration >= POSTGRES_WAIT_TIMEOUT {
|
||||
bail!("timed out while waiting for Postgres to start");
|
||||
}
|
||||
}
|
||||
|
||||
log::info!("PostgreSQL is now running, continuing to configure it");
|
||||
|
||||
Ok(())
|
||||
}
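
For orientation, here is a minimal sketch of the caller side: spawn `postgres` on a prepared data directory and block in `wait_for_postgres` until `postmaster.pid` reports `ready`. The helper name and signature come from the function above; the binary path and arguments are illustrative, not the exact compute_ctl invocation.

```rust
use std::path::Path;
use std::process::{Child, Command};

use anyhow::Result;

// Hypothetical caller sketch; `wait_for_postgres` is the function above.
fn start_and_wait(pg_bin_dir: &Path, pgdata: &Path) -> Result<Child> {
    let mut pg = Command::new(pg_bin_dir.join("postgres"))
        .args(["-D", pgdata.to_str().unwrap()])
        .spawn()?;

    // Returns once the last line of pgdata/postmaster.pid is "ready",
    // or bails out after POSTGRES_WAIT_TIMEOUT.
    wait_for_postgres(&mut pg, pgdata)?;
    Ok(pg)
}
```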
|
||||
|
||||
|
||||
@@ -19,7 +19,9 @@ thiserror = "1"
|
||||
nix = "0.23"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
safekeeper = { path = "../safekeeper" }
|
||||
# Note: Do not directly depend on pageserver or safekeeper; use pageserver_api or safekeeper_api
|
||||
# instead, so that recompile times are better.
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
safekeeper_api = { path = "../libs/safekeeper_api" }
|
||||
utils = { path = "../libs/utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -12,12 +12,12 @@ use control_plane::local_env::{EtcdBroker, LocalEnv};
|
||||
use control_plane::safekeeper::SafekeeperNode;
|
||||
use control_plane::storage::PageServerNode;
|
||||
use control_plane::{etcd, local_env};
|
||||
use pageserver::config::defaults::{
|
||||
use pageserver_api::models::TimelineInfo;
|
||||
use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR as DEFAULT_PAGESERVER_HTTP_ADDR,
|
||||
DEFAULT_PG_LISTEN_ADDR as DEFAULT_PAGESERVER_PG_ADDR,
|
||||
};
|
||||
use pageserver::http::models::TimelineInfo;
|
||||
use safekeeper::defaults::{
|
||||
use safekeeper_api::{
|
||||
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
|
||||
DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
|
||||
};
|
||||
@@ -39,6 +39,8 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
|
||||
const DEFAULT_BRANCH_NAME: &str = "main";
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
const DEFAULT_PG_VERSION: &str = "14";
|
||||
|
||||
fn default_conf(etcd_binary_path: &Path) -> String {
|
||||
format!(
|
||||
r#"
|
||||
@@ -105,6 +107,13 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.required(false);
|
||||
|
||||
let pg_version_arg = Arg::new("pg-version")
|
||||
.long("pg-version")
|
||||
.help("Postgres version to use for the initial tenant")
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.default_value(DEFAULT_PG_VERSION);
|
||||
|
||||
let port_arg = Arg::new("port")
|
||||
.long("port")
|
||||
.required(false)
|
||||
@@ -146,6 +155,7 @@ fn main() -> Result<()> {
|
||||
.required(false)
|
||||
.value_name("config"),
|
||||
)
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
App::new("timeline")
|
||||
@@ -164,7 +174,9 @@ fn main() -> Result<()> {
|
||||
.subcommand(App::new("create")
|
||||
.about("Create a new blank timeline")
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(branch_name_arg.clone()))
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("import")
|
||||
.about("Import timeline from basebackup directory")
|
||||
.arg(tenant_id_arg.clone())
|
||||
@@ -178,7 +190,9 @@ fn main() -> Result<()> {
|
||||
.arg(Arg::new("wal-tarfile").long("wal-tarfile").takes_value(true)
|
||||
.help("Wal to add after base"))
|
||||
.arg(Arg::new("end-lsn").long("end-lsn").takes_value(true)
|
||||
.help("Lsn the basebackup ends at")))
|
||||
.help("Lsn the basebackup ends at"))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
).subcommand(
|
||||
App::new("tenant")
|
||||
.setting(AppSettings::ArgRequiredElseHelp)
|
||||
@@ -188,6 +202,7 @@ fn main() -> Result<()> {
|
||||
.arg(tenant_id_arg.clone())
|
||||
.arg(timeline_id_arg.clone().help("Use a specific timeline id when creating a tenant and its initial timeline"))
|
||||
.arg(Arg::new("config").short('c').takes_value(true).multiple_occurrences(true).required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("config")
|
||||
.arg(tenant_id_arg.clone())
|
||||
@@ -239,8 +254,9 @@ fn main() -> Result<()> {
|
||||
Arg::new("config-only")
|
||||
.help("Don't do basebackup, create compute node with only config files")
|
||||
.long("config-only")
|
||||
.required(false)
|
||||
))
|
||||
.required(false))
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(App::new("start")
|
||||
.about("Start a postgres compute node.\n This command actually creates new node from scratch, but preserves existing config files")
|
||||
.arg(pg_node_arg.clone())
|
||||
@@ -248,7 +264,9 @@ fn main() -> Result<()> {
|
||||
.arg(branch_name_arg.clone())
|
||||
.arg(timeline_id_arg.clone())
|
||||
.arg(lsn_arg.clone())
|
||||
.arg(port_arg.clone()))
|
||||
.arg(port_arg.clone())
|
||||
.arg(pg_version_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
App::new("stop")
|
||||
.arg(pg_node_arg.clone())
|
||||
@@ -501,9 +519,16 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
default_conf(&EtcdBroker::locate_etcd()?)
|
||||
};
|
||||
|
||||
let pg_version = init_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let mut env =
|
||||
LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
|
||||
env.init().context("Failed to initialize neon repository")?;
|
||||
env.init(pg_version)
|
||||
.context("Failed to initialize neon repository")?;
|
||||
let initial_tenant_id = env
|
||||
.default_tenant_id
|
||||
.expect("default_tenant_id should be generated by the `env.init()` call above");
|
||||
@@ -515,6 +540,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
|
||||
Some(initial_tenant_id),
|
||||
initial_timeline_id_arg,
|
||||
&pageserver_config_overrides(init_match),
|
||||
pg_version,
|
||||
)
|
||||
.unwrap_or_else(|e| {
|
||||
eprintln!("pageserver init failed: {e}");
|
||||
@@ -557,8 +583,19 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
|
||||
// Create an initial timeline for the new tenant
|
||||
let new_timeline_id = parse_timeline_id(create_match)?;
|
||||
let timeline_info =
|
||||
pageserver.timeline_create(new_tenant_id, new_timeline_id, None, None)?;
|
||||
let pg_version = create_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info = pageserver.timeline_create(
|
||||
new_tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
let last_record_lsn = timeline_info
|
||||
.local
|
||||
@@ -607,7 +644,15 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
let new_branch_name = create_match
|
||||
.value_of("branch-name")
|
||||
.ok_or_else(|| anyhow!("No branch name provided"))?;
|
||||
let timeline_info = pageserver.timeline_create(tenant_id, None, None, None)?;
|
||||
|
||||
let pg_version = create_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let timeline_info =
|
||||
pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
let last_record_lsn = timeline_info
|
||||
@@ -650,12 +695,19 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
// TODO validate both or none are provided
|
||||
let pg_wal = end_lsn.zip(wal_tarfile);
|
||||
|
||||
let pg_version = import_match
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
let mut cplane = ComputeControlPlane::load(env.clone())?;
|
||||
println!("Importing timeline into pageserver ...");
|
||||
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal)?;
|
||||
pageserver.timeline_import(tenant_id, timeline_id, base, pg_wal, pg_version)?;
|
||||
println!("Creating node for imported timeline ...");
|
||||
env.register_branch_mapping(name.to_string(), tenant_id, timeline_id)?;
|
||||
cplane.new_node(tenant_id, name, timeline_id, None, None)?;
|
||||
|
||||
cplane.new_node(tenant_id, name, timeline_id, None, None, pg_version)?;
|
||||
println!("Done");
|
||||
}
|
||||
Some(("branch", branch_match)) => {
|
||||
@@ -682,6 +734,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
|
||||
None,
|
||||
start_lsn,
|
||||
Some(ancestor_timeline_id),
|
||||
None,
|
||||
)?;
|
||||
let new_timeline_id = timeline_info.timeline_id;
|
||||
|
||||
@@ -797,7 +850,14 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
Some(p) => Some(p.parse()?),
|
||||
None => None,
|
||||
};
|
||||
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port)?;
|
||||
|
||||
let pg_version = sub_args
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
|
||||
cplane.new_node(tenant_id, &node_name, timeline_id, lsn, port, pg_version)?;
|
||||
}
|
||||
"start" => {
|
||||
let port: Option<u16> = match sub_args.value_of("port") {
|
||||
@@ -835,16 +895,23 @@ fn handle_pg(pg_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
.map(Lsn::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse Lsn from the request")?;
|
||||
let pg_version = sub_args
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?;
|
||||
// When used with a custom port this results in non-obvious behaviour:
|
||||
// the port is remembered from the first start command, i.e.
|
||||
// start --port X
|
||||
// stop
|
||||
// start <-- will also use port X even without explicit port argument
|
||||
println!(
|
||||
"Starting new postgres {} on timeline {} ...",
|
||||
node_name, timeline_id
|
||||
"Starting new postgres (v{}) {} on timeline {} ...",
|
||||
pg_version, node_name, timeline_id
|
||||
);
|
||||
let node = cplane.new_node(tenant_id, node_name, timeline_id, lsn, port)?;
|
||||
|
||||
let node =
|
||||
cplane.new_node(tenant_id, node_name, timeline_id, lsn, port, pg_version)?;
|
||||
node.start(&auth_token)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,7 @@ use utils::{
|
||||
postgres_backend::AuthType,
|
||||
};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::local_env::{LocalEnv, DEFAULT_PG_VERSION};
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
use crate::storage::PageServerNode;
|
||||
|
||||
@@ -81,6 +81,7 @@ impl ComputeControlPlane {
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
port: Option<u16>,
|
||||
pg_version: u32,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
let port = port.unwrap_or_else(|| self.get_port());
|
||||
let node = Arc::new(PostgresNode {
|
||||
@@ -93,6 +94,7 @@ impl ComputeControlPlane {
|
||||
lsn,
|
||||
tenant_id,
|
||||
uses_wal_proposer: false,
|
||||
pg_version,
|
||||
});
|
||||
|
||||
node.create_pgdata()?;
|
||||
@@ -118,6 +120,7 @@ pub struct PostgresNode {
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenant_id: TenantId,
|
||||
uses_wal_proposer: bool,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
impl PostgresNode {
|
||||
@@ -152,6 +155,14 @@ impl PostgresNode {
|
||||
let tenant_id: TenantId = conf.parse_field("neon.tenant_id", &context)?;
|
||||
let uses_wal_proposer = conf.get("neon.safekeepers").is_some();
|
||||
|
||||
// Read postgres version from PG_VERSION file to determine which postgres version binary to use.
|
||||
// If it doesn't exist, assume broken data directory and use default pg version.
|
||||
let pg_version_path = entry.path().join("PG_VERSION");
|
||||
|
||||
let pg_version_str =
|
||||
fs::read_to_string(pg_version_path).unwrap_or_else(|_| DEFAULT_PG_VERSION.to_string());
|
||||
let pg_version = u32::from_str(&pg_version_str)?;
|
||||
|
||||
// parse recovery_target_lsn, if any
|
||||
let recovery_target_lsn: Option<Lsn> =
|
||||
conf.parse_field_optional("recovery_target_lsn", &context)?;
|
||||
@@ -167,17 +178,24 @@ impl PostgresNode {
|
||||
lsn: recovery_target_lsn,
|
||||
tenant_id,
|
||||
uses_wal_proposer,
|
||||
pg_version,
|
||||
})
|
||||
}
|
||||
|
||||
fn sync_safekeepers(&self, auth_token: &Option<String>) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir().join("postgres");
|
||||
fn sync_safekeepers(&self, auth_token: &Option<String>, pg_version: u32) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir(pg_version).join("postgres");
|
||||
let mut cmd = Command::new(&pg_path);
|
||||
|
||||
cmd.arg("--sync-safekeepers")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env(
|
||||
"LD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
|
||||
)
|
||||
.env(
|
||||
"DYLD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(pg_version).to_str().unwrap(),
|
||||
)
|
||||
.env("PGDATA", self.pgdata().to_str().unwrap())
|
||||
.stdout(Stdio::piped())
|
||||
// Comment this to avoid capturing stderr (useful if command hangs)
|
||||
@@ -259,8 +277,8 @@ impl PostgresNode {
|
||||
})
|
||||
}
|
||||
|
||||
// Connect to a page server, get base backup, and untar it to initialize a
|
||||
// new data directory
|
||||
// Write postgresql.conf with default configuration
|
||||
// and PG_VERSION file to the data directory of a new node.
|
||||
fn setup_pg_conf(&self, auth_type: AuthType) -> Result<()> {
|
||||
let mut conf = PostgresConf::new();
|
||||
conf.append("max_wal_senders", "10");
|
||||
@@ -357,6 +375,9 @@ impl PostgresNode {
|
||||
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
|
||||
file.write_all(conf.to_string().as_bytes())?;
|
||||
|
||||
let mut file = File::create(self.pgdata().join("PG_VERSION"))?;
|
||||
file.write_all(self.pg_version.to_string().as_bytes())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -368,7 +389,7 @@ impl PostgresNode {
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_safekeepers(auth_token)?;
|
||||
let lsn = self.sync_safekeepers(auth_token, self.pg_version)?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
@@ -401,7 +422,7 @@ impl PostgresNode {
|
||||
}
|
||||
|
||||
fn pg_ctl(&self, args: &[&str], auth_token: &Option<String>) -> Result<()> {
|
||||
let pg_ctl_path = self.env.pg_bin_dir().join("pg_ctl");
|
||||
let pg_ctl_path = self.env.pg_bin_dir(self.pg_version).join("pg_ctl");
|
||||
let mut cmd = Command::new(pg_ctl_path);
|
||||
cmd.args(
|
||||
[
|
||||
@@ -417,8 +438,14 @@ impl PostgresNode {
|
||||
.concat(),
|
||||
)
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", self.env.pg_lib_dir().to_str().unwrap());
|
||||
.env(
|
||||
"LD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
|
||||
)
|
||||
.env(
|
||||
"DYLD_LIBRARY_PATH",
|
||||
self.env.pg_lib_dir(self.pg_version).to_str().unwrap(),
|
||||
);
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
|
||||
@@ -20,6 +20,8 @@ use utils::{
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
|
||||
//
|
||||
// This data structures represents neon_local CLI config
|
||||
//
|
||||
@@ -195,12 +197,33 @@ impl Default for SafekeeperConf {
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
// postgres installation paths
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
pub fn pg_distrib_dir_raw(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.clone()
|
||||
}
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
|
||||
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
|
||||
let path = self.pg_distrib_dir.clone();
|
||||
|
||||
match pg_version {
|
||||
14 => path.join(format!("v{pg_version}")),
|
||||
15 => path.join(format!("v{pg_version}")),
|
||||
_ => panic!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
|
||||
match pg_version {
|
||||
14 => self.pg_distrib_dir(pg_version).join("bin"),
|
||||
15 => self.pg_distrib_dir(pg_version).join("bin"),
|
||||
_ => panic!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
|
||||
match pg_version {
|
||||
14 => self.pg_distrib_dir(pg_version).join("lib"),
|
||||
15 => self.pg_distrib_dir(pg_version).join("lib"),
|
||||
_ => panic!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pageserver_bin(&self) -> anyhow::Result<PathBuf> {
|
||||
@@ -289,13 +312,15 @@ impl LocalEnv {
|
||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
||||
|
||||
// Find postgres binaries.
|
||||
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install/v14".
|
||||
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "pg_install".
|
||||
// Note that later in the code we assume that distrib dirs follow the same pattern
|
||||
// for all postgres versions.
|
||||
if env.pg_distrib_dir == Path::new("") {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
env.pg_distrib_dir = postgres_bin.into();
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
env.pg_distrib_dir = cwd.join("pg_install/v14")
|
||||
env.pg_distrib_dir = cwd.join("pg_install")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -384,7 +409,7 @@ impl LocalEnv {
|
||||
//
|
||||
// Initialize a new Neon repository
|
||||
//
|
||||
pub fn init(&mut self) -> anyhow::Result<()> {
|
||||
pub fn init(&mut self, pg_version: u32) -> anyhow::Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = &self.base_data_dir;
|
||||
ensure!(
|
||||
@@ -397,10 +422,10 @@ impl LocalEnv {
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
if !self.pg_bin_dir(pg_version).join("postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
self.pg_bin_dir(pg_version).display()
|
||||
);
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
|
||||
@@ -12,7 +12,7 @@ use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper::http::models::TimelineCreateRequest;
|
||||
use safekeeper_api::models::TimelineCreateRequest;
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
|
||||
@@ -11,7 +11,7 @@ use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{
|
||||
use pageserver_api::models::{
|
||||
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
|
||||
};
|
||||
use postgres::{Config, NoTls};
|
||||
@@ -112,11 +112,15 @@ impl PageServerNode {
|
||||
create_tenant: Option<TenantId>,
|
||||
initial_timeline_id: Option<TimelineId>,
|
||||
config_overrides: &[&str],
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
let id = format!("id={}", self.env.pageserver.id);
|
||||
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
|
||||
let pg_distrib_dir_param =
|
||||
format!("pg_distrib_dir='{}'", self.env.pg_distrib_dir.display());
|
||||
let pg_distrib_dir_param = format!(
|
||||
"pg_distrib_dir='{}'",
|
||||
self.env.pg_distrib_dir_raw().display()
|
||||
);
|
||||
|
||||
let authg_type_param = format!("auth_type='{}'", self.env.pageserver.auth_type);
|
||||
let listen_http_addr_param = format!(
|
||||
"listen_http_addr='{}'",
|
||||
@@ -159,7 +163,7 @@ impl PageServerNode {
|
||||
|
||||
self.start_node(&init_config_overrides, &self.env.base_data_dir, true)?;
|
||||
let init_result = self
|
||||
.try_init_timeline(create_tenant, initial_timeline_id)
|
||||
.try_init_timeline(create_tenant, initial_timeline_id, pg_version)
|
||||
.context("Failed to create initial tenant and timeline for pageserver");
|
||||
match &init_result {
|
||||
Ok(initial_timeline_id) => {
|
||||
@@ -175,12 +179,16 @@ impl PageServerNode {
|
||||
&self,
|
||||
new_tenant_id: Option<TenantId>,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<TimelineId> {
|
||||
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())
|
||||
.context("failed to create tenant")?;
|
||||
let initial_timeline_info =
|
||||
self.timeline_create(initial_tenant_id, new_timeline_id, None, None)
|
||||
.context("failed to create timeline")?;
|
||||
let initial_tenant_id = self.tenant_create(new_tenant_id, HashMap::new())?;
|
||||
let initial_timeline_info = self.timeline_create(
|
||||
initial_tenant_id,
|
||||
new_timeline_id,
|
||||
None,
|
||||
None,
|
||||
Some(pg_version),
|
||||
)?;
|
||||
Ok(initial_timeline_info.timeline_id)
|
||||
}
|
||||
|
||||
@@ -504,6 +512,7 @@ impl PageServerNode {
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
ancestor_start_lsn: Option<Lsn>,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
pg_version: Option<u32>,
|
||||
) -> anyhow::Result<TimelineInfo> {
|
||||
self.http_request(
|
||||
Method::POST,
|
||||
@@ -513,6 +522,7 @@ impl PageServerNode {
|
||||
new_timeline_id,
|
||||
ancestor_start_lsn,
|
||||
ancestor_timeline_id,
|
||||
pg_version,
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
@@ -542,6 +552,7 @@ impl PageServerNode {
|
||||
timeline_id: TimelineId,
|
||||
base: (Lsn, PathBuf),
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
|
||||
@@ -560,8 +571,9 @@ impl PageServerNode {
|
||||
};
|
||||
|
||||
// Import base
|
||||
let import_cmd =
|
||||
format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||
let import_cmd = format!(
|
||||
"import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}"
|
||||
);
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut base_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
|
||||
@@ -148,31 +148,6 @@ relcache? (I think we do cache nblocks in relcache already, check why that's not
|
||||
Neon)
|
||||
|
||||
|
||||
## Misc change in vacuumlazy.c
|
||||
|
||||
```
|
||||
index 8aab6e324e..c684c4fbee 100644
|
||||
--- a/src/backend/access/heap/vacuumlazy.c
|
||||
+++ b/src/backend/access/heap/vacuumlazy.c
|
||||
@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
|
||||
else if (all_visible_according_to_vm && !PageIsAllVisible(page)
|
||||
&& VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
|
||||
{
|
||||
- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
|
||||
+ /* ZENITH-XXX: all visible hint is not wal-logged
|
||||
+ * FIXME: Replay visibilitymap changes in pageserver
|
||||
+ */
|
||||
+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
|
||||
vacrel->relname, blkno);
|
||||
visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
|
||||
VISIBILITYMAP_VALID_BITS);
|
||||
```
|
||||
|
||||
|
||||
Is this still needed? If that WARNING happens, it looks like potential corruption that we should
|
||||
fix!
|
||||
|
||||
|
||||
## Use buffer manager when extending VM or FSM
|
||||
|
||||
```
|
||||
|
||||
@@ -155,6 +155,8 @@ for other files and for sockets for incoming connections.
|
||||
#### pg_distrib_dir
|
||||
|
||||
A directory with the Postgres installation to use during pageserver activities.
|
||||
Since pageserver supports several postgres versions, `pg_distrib_dir` contains
|
||||
a subdirectory for each version with naming convention `v{PG_MAJOR_VERSION}/`.
|
||||
Inside that dir, a `bin/postgres` binary should be present.
|
||||
|
||||
The default distrib dir is `./pg_install/`.
|
||||
|
||||
@@ -96,7 +96,7 @@ A single virtual environment with all dependencies is described in the single `P
|
||||
sudo apt install python3.9
|
||||
```
|
||||
- Install `poetry`
|
||||
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
|
||||
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation).
|
||||
- Install dependencies via `./scripts/pysync`.
|
||||
- Note that CI uses a specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
|
||||
so if you have a different version, some linting tools can yield different results locally than in the CI.
|
||||
|
||||
12
libs/pageserver_api/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "pageserver_api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
const_format = "0.2.21"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
9
libs/pageserver_api/src/lib.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod models;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
@@ -7,7 +7,17 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::tenant::TenantState;
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TenantState {
|
||||
/// Tenant is fully operational, its background jobs might be running or not.
|
||||
Active { background_jobs_running: bool },
|
||||
/// A tenant is recognized by pageserver, but not yet ready to operate:
|
||||
/// e.g. not present locally and being downloaded or being read into memory from the file system.
|
||||
Paused,
|
||||
/// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
|
||||
Broken,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
@@ -21,6 +31,7 @@ pub struct TimelineCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub ancestor_start_lsn: Option<Lsn>,
|
||||
pub pg_version: Option<u32>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -138,6 +149,7 @@ pub struct LocalTimelineInfo {
|
||||
pub last_received_msg_lsn: Option<Lsn>,
|
||||
/// the timestamp (in microseconds) of the last received message
|
||||
pub last_received_msg_ts: Option<u128>,
|
||||
pub pg_version: u32,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -161,3 +173,21 @@ pub struct TimelineInfo {
|
||||
pub local: Option<LocalTimelineInfo>,
|
||||
pub remote: Option<RemoteTimelineInfo>,
|
||||
}
|
||||
|
||||
pub type ConfigureFailpointsRequest = Vec<FailpointConfig>;
|
||||
|
||||
/// Information for configuring a single fail point
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct FailpointConfig {
|
||||
/// Name of the fail point
|
||||
pub name: String,
|
||||
/// List of actions to take, using the format described in `fail::cfg`
|
||||
///
|
||||
/// We also support `actions = "exit"` to cause the fail point to immediately exit.
|
||||
pub actions: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct TimelineGcRequest {
|
||||
pub gc_horizon: Option<u64>,
|
||||
}
|
||||
@@ -25,4 +25,5 @@ postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d05
|
||||
wal_craft = { path = "wal_craft" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.59.1"
|
||||
anyhow = "1.0"
|
||||
bindgen = "0.60.1"
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
use anyhow::{anyhow, Context};
|
||||
use bindgen::callbacks::ParseCallbacks;
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -42,7 +43,7 @@ impl ParseCallbacks for PostgresFfiCallbacks {
|
||||
}
|
||||
}
|
||||
|
||||
fn main() {
|
||||
fn main() -> anyhow::Result<()> {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=bindgen_deps.h");
|
||||
|
||||
@@ -58,7 +59,7 @@ fn main() {
|
||||
for pg_version in &["v14", "v15"] {
|
||||
let mut pg_install_dir_versioned = pg_install_dir.join(pg_version);
|
||||
if pg_install_dir_versioned.is_relative() {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
let cwd = env::current_dir().context("Failed to get current_dir")?;
|
||||
pg_install_dir_versioned = cwd.join("..").join("..").join(pg_install_dir_versioned);
|
||||
}
|
||||
|
||||
@@ -70,21 +71,25 @@ fn main() {
|
||||
let output = Command::new(pg_config_bin)
|
||||
.arg("--includedir-server")
|
||||
.output()
|
||||
.expect("failed to execute `pg_config --includedir-server`");
|
||||
.context("failed to execute `pg_config --includedir-server`")?;
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("`pg_config --includedir-server` failed")
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim_end().into()
|
||||
String::from_utf8(output.stdout)
|
||||
.context("pg_config output is not UTF-8")?
|
||||
.trim_end()
|
||||
.into()
|
||||
} else {
|
||||
pg_install_dir_versioned
|
||||
let server_path = pg_install_dir_versioned
|
||||
.join("include")
|
||||
.join("postgresql")
|
||||
.join("server")
|
||||
.into_os_string()
|
||||
.into_os_string();
|
||||
server_path
|
||||
.into_string()
|
||||
.unwrap()
|
||||
.map_err(|s| anyhow!("Bad postgres server path {s:?}"))?
|
||||
};
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
@@ -132,14 +137,18 @@ fn main() {
|
||||
// Finish the builder and generate the bindings.
|
||||
//
|
||||
.generate()
|
||||
.expect("Unable to generate bindings");
|
||||
.context("Unable to generate bindings")?;
|
||||
|
||||
// Write the bindings to the $OUT_DIR/bindings_$pg_version.rs file.
|
||||
let out_path = PathBuf::from(env::var("OUT_DIR").unwrap());
|
||||
let out_path: PathBuf = env::var("OUT_DIR")
|
||||
.context("Couldn't read OUT_DIR environment variable var")?
|
||||
.into();
|
||||
let filename = format!("bindings_{pg_version}.rs");
|
||||
|
||||
bindings
|
||||
.write_to_file(out_path.join(filename))
|
||||
.expect("Couldn't write bindings!");
|
||||
.context("Couldn't write bindings")?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -3,10 +3,14 @@
|
||||
#![allow(non_snake_case)]
|
||||
// bindgen creates some unsafe code with no doc comments.
|
||||
#![allow(clippy::missing_safety_doc)]
|
||||
// suppress warnings on rust 1.53 due to bindgen unit tests.
|
||||
// https://github.com/rust-lang/rust-bindgen/issues/1651
|
||||
#![allow(deref_nullptr)]
|
||||
// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
|
||||
#![allow(clippy::useless_transmute)]
|
||||
// modules included with the postgres_ffi macro depend on the types of the specific version's
|
||||
// types, and trigger a too eager lint.
|
||||
#![allow(clippy::duplicate_mod)]
|
||||
|
||||
use bytes::Bytes;
|
||||
use utils::bin_ser::SerializeError;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
macro_rules! postgres_ffi {
|
||||
@@ -24,12 +28,12 @@ macro_rules! postgres_ffi {
|
||||
stringify!($version),
|
||||
".rs"
|
||||
));
|
||||
|
||||
include!(concat!("pg_constants_", stringify!($version), ".rs"));
|
||||
}
|
||||
pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod waldecoder;
|
||||
pub mod waldecoder_handler;
|
||||
pub mod xlog_utils;
|
||||
|
||||
pub const PG_MAJORVERSION: &str = stringify!($version);
|
||||
@@ -44,6 +48,9 @@ macro_rules! postgres_ffi {
|
||||
postgres_ffi!(v14);
|
||||
postgres_ffi!(v15);
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::{uint32, uint64, Oid};
|
||||
pub use v14::bindings::{BlockNumber, OffsetNumber};
|
||||
@@ -52,8 +59,11 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
|
||||
|
||||
// Likewise for these, although the assumption that these don't change is a little more iffy.
|
||||
pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
|
||||
pub use v14::bindings::{PageHeaderData, XLogRecord};
|
||||
pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
|
||||
|
||||
pub use v14::bindings::{CheckPoint, ControlFileData};
|
||||
|
||||
// from pg_config.h. These can be changed with configure options --with-blocksize=BLOCKSIZE and
|
||||
// --with-segsize=SEGSIZE, but assume the defaults for now.
|
||||
pub const BLCKSZ: u16 = 8192;
|
||||
@@ -63,6 +73,49 @@ pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
|
||||
|
||||
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
// Export some version independent functions that are used outside of this mod
|
||||
pub use v14::xlog_utils::encode_logical_message;
|
||||
pub use v14::xlog_utils::get_current_timestamp;
|
||||
pub use v14::xlog_utils::to_pg_timestamp;
|
||||
pub use v14::xlog_utils::XLogFileName;
|
||||
|
||||
pub use v14::bindings::DBState_DB_SHUTDOWNED;
|
||||
|
||||
pub fn bkpimage_is_compressed(bimg_info: u8, version: u32) -> anyhow::Result<bool> {
|
||||
match version {
|
||||
14 => Ok(bimg_info & v14::bindings::BKPIMAGE_IS_COMPRESSED != 0),
|
||||
15 => Ok(bimg_info & v15::bindings::BKPIMAGE_COMPRESS_PGLZ != 0
|
||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_LZ4 != 0
|
||||
|| bimg_info & v15::bindings::BKPIMAGE_COMPRESS_ZSTD != 0),
|
||||
_ => anyhow::bail!("Unknown version {}", version),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_wal_segment(
|
||||
segno: u64,
|
||||
system_id: u64,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, SerializeError> {
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_wal_segment(segno, system_id),
|
||||
15 => v15::xlog_utils::generate_wal_segment(segno, system_id),
|
||||
_ => Err(SerializeError::BadInput),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_pg_control(
|
||||
pg_control_bytes: &[u8],
|
||||
checkpoint_bytes: &[u8],
|
||||
lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<(Bytes, u64)> {
|
||||
match pg_version {
|
||||
14 => v14::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
15 => v15::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
|
||||
_ => anyhow::bail!("Unknown version {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
// PG timeline is always 1, changing it doesn't have any useful meaning in Neon.
|
||||
//
|
||||
// NOTE: this is not to be confused with Neon timelines; different concept!
|
||||
@@ -74,7 +127,7 @@ pub const PG_TLI: u32 = 1;
|
||||
|
||||
// See TransactionIdIsNormal in transam.h
|
||||
pub const fn transaction_id_is_normal(id: TransactionId) -> bool {
|
||||
id > v14::pg_constants::FIRST_NORMAL_TRANSACTION_ID
|
||||
id > pg_constants::FIRST_NORMAL_TRANSACTION_ID
|
||||
}
|
||||
|
||||
// See TransactionIdPrecedes in transam.c
|
||||
@@ -109,3 +162,76 @@ pub fn page_set_lsn(pg: &mut [u8], lsn: Lsn) {
|
||||
pg[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
pg[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
}
|
||||
|
||||
pub mod waldecoder {
|
||||
|
||||
use crate::{v14, v15};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use std::num::NonZeroU32;
|
||||
use thiserror::Error;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
pub enum State {
|
||||
WaitingForRecord,
|
||||
ReassemblingRecord {
|
||||
recordbuf: BytesMut,
|
||||
contlen: NonZeroU32,
|
||||
},
|
||||
SkippingEverything {
|
||||
skip_until_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
pub struct WalStreamDecoder {
|
||||
pub lsn: Lsn,
|
||||
pub pg_version: u32,
|
||||
pub inputbuf: BytesMut,
|
||||
pub state: State,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
pub msg: String,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn, pg_version: u32) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
pg_version,
|
||||
inputbuf: BytesMut::new(),
|
||||
state: State::WaitingForRecord,
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
match self.pg_version {
|
||||
// This is a trick to support both versions simultaneously.
|
||||
// See WalStreamDecoderHandler comments.
|
||||
14 => {
|
||||
use self::v14::waldecoder_handler::WalStreamDecoderHandler;
|
||||
self.poll_decode_internal()
|
||||
}
|
||||
15 => {
|
||||
use self::v15::waldecoder_handler::WalStreamDecoderHandler;
|
||||
self.poll_decode_internal()
|
||||
}
|
||||
_ => Err(WalDecodeError {
|
||||
msg: format!("Unknown version {}", self.pg_version),
|
||||
lsn: self.lsn,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
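
As a side note, the dispatch trick used in `poll_decode` above can be reduced to a toy, self-contained sketch. All names here are made up; the real code uses the `WalStreamDecoderHandler` trait compiled separately inside each version module. The key point is that each module declares its own trait with the same method and implements it for the shared struct, so a `use` in each match arm selects which implementation the call resolves to.

```rust
// Toy illustration of per-version trait dispatch (hypothetical names).
pub struct Decoder {
    pub pg_version: u32,
}

mod v_a {
    pub trait Handler {
        fn decode_one(&mut self) -> u32;
    }
    impl Handler for super::Decoder {
        fn decode_one(&mut self) -> u32 {
            14 // this module's version-specific decoding would live here
        }
    }
}

mod v_b {
    pub trait Handler {
        fn decode_one(&mut self) -> u32;
    }
    impl Handler for super::Decoder {
        fn decode_one(&mut self) -> u32 {
            15
        }
    }
}

impl Decoder {
    // Bringing exactly one trait into scope per arm makes the method call
    // unambiguous, just like poll_decode() dispatching on pg_version above.
    pub fn decode(&mut self) -> u32 {
        match self.pg_version {
            14 => {
                use v_a::Handler;
                self.decode_one()
            }
            _ => {
                use v_b::Handler;
                self.decode_one()
            }
        }
    }
}
```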
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL non-relation files.
|
||||
//!
|
||||
use super::pg_constants;
|
||||
use crate::pg_constants;
|
||||
use crate::transaction_id_precedes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
|
||||
@@ -1,14 +1,16 @@
|
||||
//!
|
||||
//! Misc constants, copied from PostgreSQL headers.
|
||||
//!
|
||||
//! Only place version-independent constants here.
|
||||
//!
|
||||
//! TODO: These probably should be auto-generated using bindgen,
|
||||
//! rather than copied by hand. Although on the other hand, it's nice
|
||||
//! to have them all here in one place, and have the ability to add
|
||||
//! comments on them.
|
||||
//!
|
||||
|
||||
use super::bindings::{PageHeaderData, XLogRecord};
|
||||
use crate::BLCKSZ;
|
||||
use crate::{PageHeaderData, XLogRecord};
|
||||
|
||||
//
|
||||
// From pg_tablespace_d.h
|
||||
@@ -16,14 +18,6 @@ use crate::BLCKSZ;
|
||||
pub const DEFAULTTABLESPACE_OID: u32 = 1663;
|
||||
pub const GLOBALTABLESPACE_OID: u32 = 1664;
|
||||
|
||||
//
|
||||
// Fork numbers, from relpath.h
|
||||
//
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
// From storage_xlog.h
|
||||
pub const XLOG_SMGR_CREATE: u8 = 0x10;
|
||||
pub const XLOG_SMGR_TRUNCATE: u8 = 0x20;
|
||||
@@ -114,7 +108,6 @@ pub const XLOG_NEXTOID: u8 = 0x30;
|
||||
pub const XLOG_SWITCH: u8 = 0x40;
|
||||
pub const XLOG_FPI_FOR_HINT: u8 = 0xA0;
|
||||
pub const XLOG_FPI: u8 = 0xB0;
|
||||
pub const DB_SHUTDOWNED: u32 = 1;
|
||||
|
||||
// From multixact.h
|
||||
pub const FIRST_MULTIXACT_ID: u32 = 1;
|
||||
@@ -169,10 +162,6 @@ pub const RM_HEAP_ID: u8 = 10;
|
||||
pub const XLR_INFO_MASK: u8 = 0x0F;
|
||||
pub const XLR_RMGR_INFO_MASK: u8 = 0xF0;
|
||||
|
||||
// from dbcommands_xlog.h
|
||||
pub const XLOG_DBASE_CREATE: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x10;
|
||||
|
||||
pub const XLOG_TBLSPC_CREATE: u8 = 0x00;
|
||||
pub const XLOG_TBLSPC_DROP: u8 = 0x10;
|
||||
|
||||
@@ -197,8 +186,6 @@ pub const BKPBLOCK_SAME_REL: u8 = 0x80; /* RelFileNode omitted, same as previous
|
||||
|
||||
/* Information stored in bimg_info */
|
||||
pub const BKPIMAGE_HAS_HOLE: u8 = 0x01; /* page image has "hole" */
|
||||
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
|
||||
/* From transam.h */
|
||||
pub const FIRST_NORMAL_TRANSACTION_ID: u32 = 3;
|
||||
|
||||
5
libs/postgres_ffi/src/pg_constants_v14.rs
Normal file
@@ -0,0 +1,5 @@
|
||||
pub const XLOG_DBASE_CREATE: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x10;
|
||||
|
||||
pub const BKPIMAGE_IS_COMPRESSED: u8 = 0x02; /* page image is compressed */
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x04; /* page image should be restored during replay */
|
||||
10
libs/postgres_ffi/src/pg_constants_v15.rs
Normal file
@@ -0,0 +1,10 @@
|
||||
pub const XACT_XINFO_HAS_DROPPED_STATS: u32 = 1u32 << 8;
|
||||
|
||||
pub const XLOG_DBASE_CREATE_FILE_COPY: u8 = 0x00;
|
||||
pub const XLOG_DBASE_CREATE_WAL_LOG: u8 = 0x00;
|
||||
pub const XLOG_DBASE_DROP: u8 = 0x20;
|
||||
|
||||
pub const BKPIMAGE_APPLY: u8 = 0x02; /* page image should be restored during replay */
|
||||
pub const BKPIMAGE_COMPRESS_PGLZ: u8 = 0x04; /* page image is compressed */
|
||||
pub const BKPIMAGE_COMPRESS_LZ4: u8 = 0x08; /* page image is compressed */
|
||||
pub const BKPIMAGE_COMPRESS_ZSTD: u8 = 0x10; /* page image is compressed */
|
||||
@@ -1,10 +1,17 @@
|
||||
//!
|
||||
//! Common utilities for dealing with PostgreSQL relation files.
|
||||
//!
|
||||
use super::pg_constants;
|
||||
use once_cell::sync::OnceCell;
|
||||
use regex::Regex;
|
||||
|
||||
//
|
||||
// Fork numbers, from relpath.h
|
||||
//
|
||||
pub const MAIN_FORKNUM: u8 = 0;
|
||||
pub const FSM_FORKNUM: u8 = 1;
|
||||
pub const VISIBILITYMAP_FORKNUM: u8 = 2;
|
||||
pub const INIT_FORKNUM: u8 = 3;
|
||||
|
||||
#[derive(Debug, Clone, thiserror::Error, PartialEq, Eq)]
|
||||
pub enum FilePathError {
|
||||
#[error("invalid relation fork name")]
|
||||
@@ -23,10 +30,10 @@ impl From<core::num::ParseIntError> for FilePathError {
|
||||
pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
match forkname {
|
||||
// "main" is not in filenames, it's implicit if the fork name is not present
|
||||
None => Ok(pg_constants::MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(pg_constants::FSM_FORKNUM),
|
||||
Some("vm") => Ok(pg_constants::VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(pg_constants::INIT_FORKNUM),
|
||||
None => Ok(MAIN_FORKNUM),
|
||||
Some("fsm") => Ok(FSM_FORKNUM),
|
||||
Some("vm") => Ok(VISIBILITYMAP_FORKNUM),
|
||||
Some("init") => Ok(INIT_FORKNUM),
|
||||
Some(_) => Err(FilePathError::InvalidForkName),
|
||||
}
|
||||
}
|
||||
@@ -34,10 +41,10 @@ pub fn forkname_to_number(forkname: Option<&str>) -> Result<u8, FilePathError> {
|
||||
/// Convert Postgres fork number to the right suffix of the relation data file.
|
||||
pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
|
||||
match forknum {
|
||||
pg_constants::MAIN_FORKNUM => None,
|
||||
pg_constants::FSM_FORKNUM => Some("fsm"),
|
||||
pg_constants::VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
pg_constants::INIT_FORKNUM => Some("init"),
|
||||
MAIN_FORKNUM => None,
|
||||
FSM_FORKNUM => Some("fsm"),
|
||||
VISIBILITYMAP_FORKNUM => Some("vm"),
|
||||
INIT_FORKNUM => Some("init"),
|
||||
_ => Some("UNKNOWN FORKNUM"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
//! to look deeper into the WAL records to also understand which blocks they modify, the code
|
||||
//! for that is in pageserver/src/walrecord.rs
|
||||
//!
|
||||
use super::super::waldecoder::{State, WalDecodeError, WalStreamDecoder};
|
||||
use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLogRecord, XLOG_PAGE_MAGIC};
|
||||
use super::xlog_utils::*;
|
||||
use crate::WAL_SEGMENT_SIZE;
|
||||
@@ -16,55 +17,26 @@ use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::num::NonZeroU32;
|
||||
use thiserror::Error;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
enum State {
|
||||
WaitingForRecord,
|
||||
ReassemblingRecord {
|
||||
recordbuf: BytesMut,
|
||||
contlen: NonZeroU32,
|
||||
},
|
||||
SkippingEverything {
|
||||
skip_until_lsn: Lsn,
|
||||
},
|
||||
}
|
||||
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
inputbuf: BytesMut,
|
||||
state: State,
|
||||
}
|
||||
|
||||
#[derive(Error, Debug, Clone)]
|
||||
#[error("{msg} at {lsn}")]
|
||||
pub struct WalDecodeError {
|
||||
msg: String,
|
||||
lsn: Lsn,
|
||||
pub trait WalStreamDecoderHandler {
|
||||
fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError>;
|
||||
fn poll_decode_internal(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError>;
|
||||
fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError>;
|
||||
}
|
||||
|
||||
//
|
||||
// WalRecordStream is a Stream that returns a stream of WAL records
|
||||
// FIXME: This isn't a proper rust stream
|
||||
// This is a trick to support several postgres versions simultaneously.
|
||||
//
|
||||
impl WalStreamDecoder {
|
||||
pub fn new(lsn: Lsn) -> WalStreamDecoder {
|
||||
WalStreamDecoder {
|
||||
lsn,
|
||||
inputbuf: BytesMut::new(),
|
||||
state: State::WaitingForRecord,
|
||||
}
|
||||
}
|
||||
|
||||
// The latest LSN position fed to the decoder.
|
||||
pub fn available(&self) -> Lsn {
|
||||
self.lsn + self.inputbuf.remaining() as u64
|
||||
}
|
||||
|
||||
pub fn feed_bytes(&mut self, buf: &[u8]) {
|
||||
self.inputbuf.extend_from_slice(buf);
|
||||
}
|
||||
|
||||
// Page decoding code depends on postgres bindings, so it is compiled for each version.
|
||||
// Thus WalStreamDecoder implements several WalStreamDecoderHandler traits.
|
||||
// WalStreamDecoder poll_decode() method dispatches to the right handler based on the postgres version.
|
||||
// Other methods are internal and are not dispatched.
|
||||
//
|
||||
// It is similar to having several impl blocks for the same struct,
|
||||
// but the impls here are in different modules, so need to use a trait.
|
||||
//
|
||||
impl WalStreamDecoderHandler for WalStreamDecoder {
|
||||
fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> {
|
||||
let validate_impl = || {
|
||||
if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 {
|
||||
@@ -125,7 +97,7 @@ impl WalStreamDecoder {
|
||||
/// Ok(None): there is not enough data in the input buffer. Feed more by calling the `feed_bytes` function
|
||||
/// Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
fn poll_decode_internal(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
@@ -9,12 +9,13 @@
|
||||
|
||||
use crc32c::crc32c_append;
|
||||
|
||||
use super::super::waldecoder::WalStreamDecoder;
|
||||
use super::bindings::{
|
||||
CheckPoint, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData,
|
||||
XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
|
||||
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
|
||||
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
|
||||
};
|
||||
use super::pg_constants;
|
||||
use super::waldecoder::WalStreamDecoder;
|
||||
use super::PG_MAJORVERSION;
|
||||
use crate::pg_constants;
|
||||
use crate::PG_TLI;
|
||||
use crate::{uint32, uint64, Oid};
|
||||
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
@@ -56,12 +57,10 @@ pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
/// in order to let CLOG_TRUNCATE mechanism correctly extend CLOG.
|
||||
const XID_CHECKPOINT_INTERVAL: u32 = 1024;
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegmentsPerXLogId(wal_segsz_bytes: usize) -> XLogSegNo {
|
||||
(0x100000000u64 / wal_segsz_bytes as u64) as XLogSegNo
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogSegNoOffsetToRecPtr(
|
||||
segno: XLogSegNo,
|
||||
offset: u32,
|
||||
@@ -70,7 +69,6 @@ pub fn XLogSegNoOffsetToRecPtr(
|
||||
segno * (wal_segsz_bytes as u64) + (offset as u64)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize) -> String {
|
||||
format!(
|
||||
"{:>08X}{:>08X}{:>08X}",
|
||||
@@ -80,7 +78,6 @@ pub fn XLogFileName(tli: TimeLineID, logSegNo: XLogSegNo, wal_segsz_bytes: usize
|
||||
)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLineID) {
|
||||
let tli = u32::from_str_radix(&fname[0..8], 16).unwrap();
|
||||
let log = u32::from_str_radix(&fname[8..16], 16).unwrap() as XLogSegNo;
|
||||
@@ -88,12 +85,10 @@ pub fn XLogFromFileName(fname: &str, wal_seg_size: usize) -> (XLogSegNo, TimeLin
|
||||
(log * XLogSegmentsPerXLogId(wal_seg_size) + seg, tli)
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsXLogFileName(fname: &str) -> bool {
|
||||
return fname.len() == XLOG_FNAME_LEN && fname.chars().all(|c| c.is_ascii_hexdigit());
|
||||
}
|
||||
|
||||
#[allow(non_snake_case)]
|
||||
pub fn IsPartialXLogFileName(fname: &str) -> bool {
|
||||
fname.ends_with(".partial") && IsXLogFileName(&fname[0..fname.len() - 8])
|
||||
}
|
||||
@@ -113,6 +108,30 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn generate_pg_control(
|
||||
pg_control_bytes: &[u8],
|
||||
checkpoint_bytes: &[u8],
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<(Bytes, u64)> {
|
||||
let mut pg_control = ControlFileData::decode(pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?;
|
||||
|
||||
// Generate new pg_control needed for bootstrap
|
||||
checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
//TODO Check this.
|
||||
//We may need to determine the value from twophase data.
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
pg_control.state = DBState_DB_SHUTDOWNED;
|
||||
|
||||
Ok((pg_control.encode(), pg_control.system_identifier))
|
||||
}
|
||||
|
||||
pub fn get_current_timestamp() -> TimestampTz {
|
||||
to_pg_timestamp(SystemTime::now())
|
||||
}
|
||||
@@ -144,7 +163,10 @@ pub fn find_end_of_wal(
|
||||
let mut result = start_lsn;
|
||||
let mut curr_lsn = start_lsn;
|
||||
let mut buf = [0u8; XLOG_BLCKSZ];
|
||||
let mut decoder = WalStreamDecoder::new(start_lsn);
|
||||
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
|
||||
debug!("find_end_of_wal PG_VERSION: {}", pg_version);
|
||||
|
||||
let mut decoder = WalStreamDecoder::new(start_lsn, pg_version);
|
||||
|
||||
// loop over segments
|
||||
loop {
|
||||
@@ -154,7 +176,7 @@ pub fn find_end_of_wal(
|
||||
match open_wal_segment(&seg_file_path)? {
|
||||
None => {
|
||||
// no more segments
|
||||
info!(
|
||||
debug!(
|
||||
"find_end_of_wal reached end at {:?}, segment {:?} doesn't exist",
|
||||
result, seg_file_path
|
||||
);
|
||||
@@ -177,7 +199,7 @@ pub fn find_end_of_wal(
|
||||
match decoder.poll_decode() {
|
||||
Ok(Some(record)) => result = record.0,
|
||||
Err(e) => {
|
||||
info!(
|
||||
debug!(
|
||||
"find_end_of_wal reached end at {:?}, decode error: {:?}",
|
||||
result, e
|
||||
);
|
||||
@@ -438,12 +460,15 @@ mod tests {
|
||||
fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
|
||||
use wal_craft::*;
|
||||
|
||||
let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
|
||||
|
||||
// Craft some WAL
|
||||
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
.join("..");
|
||||
let cfg = Conf {
|
||||
pg_distrib_dir: top_path.join(format!("pg_install/{PG_MAJORVERSION}")),
|
||||
pg_version,
|
||||
pg_distrib_dir: top_path.join("pg_install"),
|
||||
datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
|
||||
};
|
||||
if cfg.datadir.exists() {
|
||||
|
||||
@@ -14,3 +14,4 @@ once_cell = "1.13.0"
|
||||
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres_ffi = { path = "../" }
|
||||
tempfile = "3.2"
|
||||
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
|
||||
|
||||
@@ -37,9 +37,16 @@ fn main() -> Result<()> {
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.takes_value(true)
|
||||
.help("Directory with Postgres distribution (bin and lib directories, e.g. pg_install/v14)")
|
||||
.help("Directory with Postgres distributions (bin and lib directories, e.g. pg_install containing subpath `v14/bin/postgresql`)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-version")
|
||||
.long("pg-version")
|
||||
.help("Postgres version to use for the initial tenant")
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
App::new("in-existing")
|
||||
@@ -82,8 +89,14 @@ fn main() -> Result<()> {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Some(("with-initdb", arg_matches)) => {
|
||||
let cfg = Conf {
|
||||
pg_version: arg_matches
|
||||
.value_of("pg-version")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.context("Failed to parse postgres version from the argument string")?,
|
||||
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
|
||||
datadir: arg_matches.value_of("datadir").unwrap().into(),
|
||||
};
|
||||
|
||||
@@ -15,6 +15,7 @@ use tempfile::{tempdir, TempDir};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Conf {
|
||||
pub pg_version: u32,
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
pub datadir: PathBuf,
|
||||
}
|
||||
@@ -36,12 +37,22 @@ pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
impl Conf {
|
||||
pub fn pg_distrib_dir(&self) -> PathBuf {
|
||||
let path = self.pg_distrib_dir.clone();
|
||||
|
||||
match self.pg_version {
|
||||
14 => path.join(format!("v{}", self.pg_version)),
|
||||
15 => path.join(format!("v{}", self.pg_version)),
|
||||
_ => panic!("Unsupported postgres version: {}", self.pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
self.pg_distrib_dir().join("bin")
|
||||
}
|
||||
|
||||
fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
self.pg_distrib_dir().join("lib")
|
||||
}
|
||||
|
||||
pub fn wal_dir(&self) -> PathBuf {
|
||||
|
||||
@@ -7,6 +7,7 @@ edition = "2021"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
async-trait = "0.1"
|
||||
metrics = { version = "0.1", path = "../metrics" }
|
||||
utils = { version = "0.1", path = "../utils" }
|
||||
once_cell = "1.13.0"
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
|
||||
@@ -9,9 +9,7 @@ mod local_fs;
|
||||
mod s3_bucket;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
ffi::OsStr,
|
||||
fmt::{Debug, Display},
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
ops::Deref,
|
||||
@@ -344,22 +342,6 @@ impl Debug for S3Config {
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds a suffix to the file(directory) name, either appending the suffux to the end of its extension,
|
||||
/// or if there's no extension, creates one and puts a suffix there.
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
.extension()
|
||||
.map(OsStr::to_string_lossy)
|
||||
{
|
||||
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
|
||||
None => Cow::Borrowed(suffix),
|
||||
};
|
||||
original_path
|
||||
.as_ref()
|
||||
.with_extension(new_extension.as_ref())
|
||||
}
|
||||
|
||||
impl RemoteStorageConfig {
|
||||
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||
let local_path = toml.get("local_path");
|
||||
@@ -448,35 +430,6 @@ fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_path_with_suffix_extension() {
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
|
||||
"/foo/bar.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.baz.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar/dir/");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar/dir..temp"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn object_name() {
|
||||
let k = RemoteObjectId("a/b/c".to_owned());
|
||||
|
||||
@@ -16,8 +16,9 @@ use tokio::{
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
use utils::crashsafe_dir::path_with_suffix_extension;
|
||||
|
||||
use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectId};
|
||||
use crate::{Download, DownloadError, RemoteObjectId};
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
|
||||
12
libs/safekeeper_api/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "safekeeper_api"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
const_format = "0.2.21"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
10
libs/safekeeper_api/src/lib.rs
Normal file
@@ -0,0 +1,10 @@
|
||||
use const_format::formatcp;
|
||||
|
||||
/// Public API types
|
||||
pub mod models;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 5454;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 7676;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
@@ -3,20 +3,20 @@
|
||||
use criterion::{criterion_group, criterion_main, Criterion};
|
||||
use utils::id;
|
||||
|
||||
pub fn bench_zid_stringify(c: &mut Criterion) {
|
||||
pub fn bench_id_stringify(c: &mut Criterion) {
|
||||
// Can only use public methods.
|
||||
let ztl = id::TenantTimelineId::generate();
|
||||
let ttid = id::TenantTimelineId::generate();
|
||||
|
||||
c.bench_function("zid.to_string", |b| {
|
||||
c.bench_function("id.to_string", |b| {
|
||||
b.iter(|| {
|
||||
// FIXME measurement overhead?
|
||||
//for _ in 0..1000 {
|
||||
// ztl.tenant_id.to_string();
|
||||
// ttid.tenant_id.to_string();
|
||||
//}
|
||||
ztl.tenant_id.to_string();
|
||||
ttid.tenant_id.to_string();
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
criterion_group!(benches, bench_zid_stringify);
|
||||
criterion_group!(benches, bench_id_stringify);
|
||||
criterion_main!(benches);
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
ffi::OsStr,
|
||||
fs::{self, File},
|
||||
io,
|
||||
path::Path,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
/// Similar to [`std::fs::create_dir`], except we fsync the
|
||||
@@ -74,6 +76,22 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Adds a suffix to the file (or directory) name, either appending the suffix to the end of its extension,
/// or, if there's no extension, creates one and puts the suffix there.
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
.extension()
|
||||
.map(OsStr::to_string_lossy)
|
||||
{
|
||||
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
|
||||
None => Cow::Borrowed(suffix),
|
||||
};
|
||||
original_path
|
||||
.as_ref()
|
||||
.with_extension(new_extension.as_ref())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tempfile::tempdir;
|
||||
@@ -122,4 +140,33 @@ mod tests {
|
||||
let invalid_dir_path = file_path.join("folder");
|
||||
create_dir_all(&invalid_dir_path).unwrap_err();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_path_with_suffix_extension() {
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
|
||||
"/foo/bar.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.baz.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar/dir/");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar/dir..temp"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
use anyhow::anyhow;
|
||||
use hyper::{header, Body, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use thiserror::Error;
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ApiError {
|
||||
#[error("Bad request: {0}")]
|
||||
BadRequest(String),
|
||||
#[error("Bad request: {0:#?}")]
|
||||
BadRequest(anyhow::Error),
|
||||
|
||||
#[error("Forbidden: {0}")]
|
||||
Forbidden(String),
|
||||
@@ -15,24 +14,20 @@ pub enum ApiError {
|
||||
Unauthorized(String),
|
||||
|
||||
#[error("NotFound: {0}")]
|
||||
NotFound(String),
|
||||
NotFound(anyhow::Error),
|
||||
|
||||
#[error("Conflict: {0}")]
|
||||
Conflict(String),
|
||||
|
||||
#[error(transparent)]
|
||||
InternalServerError(#[from] anyhow::Error),
|
||||
InternalServerError(anyhow::Error),
|
||||
}
|
||||
|
||||
impl ApiError {
|
||||
pub fn from_err<E: Into<anyhow::Error>>(err: E) -> Self {
|
||||
Self::InternalServerError(anyhow!(err))
|
||||
}
|
||||
|
||||
pub fn into_response(self) -> Response<Body> {
|
||||
match self {
|
||||
ApiError::BadRequest(_) => HttpErrorBody::response_from_msg_and_status(
|
||||
self.to_string(),
|
||||
ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status(
|
||||
format!("{err:#?}"), // use debug printing so that we give the cause
|
||||
StatusCode::BAD_REQUEST,
|
||||
),
|
||||
ApiError::Forbidden(_) => {
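With the `#[from] anyhow::Error` conversion removed, call sites now wrap errors into ApiError explicitly; a small sketch of the resulting pattern (the helper name is hypothetical):

    // A parse failure surfaces as a 400 carrying its message, instead of a generic 500.
    fn parse_id(raw: &str) -> Result<u32, ApiError> {
        raw.parse::<u32>()
            .map_err(|_| ApiError::BadRequest(anyhow::anyhow!("failed to parse {raw}")))
    }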
@@ -1,3 +1,4 @@
|
||||
use anyhow::Context;
|
||||
use bytes::Buf;
|
||||
use hyper::{header, Body, Request, Response, StatusCode};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -9,20 +10,24 @@ pub async fn json_request<T: for<'de> Deserialize<'de>>(
|
||||
) -> Result<T, ApiError> {
|
||||
let whole_body = hyper::body::aggregate(request.body_mut())
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.context("Failed to read request body")
|
||||
.map_err(ApiError::BadRequest)?;
|
||||
serde_json::from_reader(whole_body.reader())
|
||||
.map_err(|err| ApiError::BadRequest(format!("Failed to parse json request {}", err)))
|
||||
.context("Failed to parse json request")
|
||||
.map_err(ApiError::BadRequest)
|
||||
}
|
||||
|
||||
pub fn json_response<T: Serialize>(
|
||||
status: StatusCode,
|
||||
data: T,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let json = serde_json::to_string(&data).map_err(ApiError::from_err)?;
|
||||
let json = serde_json::to_string(&data)
|
||||
.context("Failed to serialize JSON response")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
let response = Response::builder()
|
||||
.status(status)
|
||||
.header(header::CONTENT_TYPE, "application/json")
|
||||
.body(Body::from(json))
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||
Ok(response)
|
||||
}
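For illustration, a handler built on these helpers; the request type is hypothetical and assumed to derive Serialize and Deserialize. A malformed body now comes back as a 400 with context rather than an opaque error:

    async fn create_thing(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
        // `json_request` attaches the read/parse context and maps failures to BadRequest.
        let body: ThingCreateRequest = json_request(&mut request).await?;
        json_response(StatusCode::CREATED, body)
    }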
@@ -1,6 +1,7 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::error::ApiError;
|
||||
use anyhow::anyhow;
|
||||
use hyper::{body::HttpBody, Body, Request};
|
||||
use routerify::ext::RequestExt;
|
||||
|
||||
@@ -10,9 +11,8 @@ pub fn get_request_param<'a>(
|
||||
) -> Result<&'a str, ApiError> {
|
||||
match request.param(param_name) {
|
||||
Some(arg) => Ok(arg),
|
||||
None => Err(ApiError::BadRequest(format!(
|
||||
"no {} specified in path param",
|
||||
param_name
|
||||
None => Err(ApiError::BadRequest(anyhow!(
|
||||
"no {param_name} specified in path param",
|
||||
))),
|
||||
}
|
||||
}
|
||||
@@ -23,16 +23,15 @@ pub fn parse_request_param<T: FromStr>(
|
||||
) -> Result<T, ApiError> {
|
||||
match get_request_param(request, param_name)?.parse() {
|
||||
Ok(v) => Ok(v),
|
||||
Err(_) => Err(ApiError::BadRequest(format!(
|
||||
"failed to parse {}",
|
||||
param_name
|
||||
Err(_) => Err(ApiError::BadRequest(anyhow!(
|
||||
"failed to parse {param_name}",
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError> {
|
||||
match request.body_mut().data().await {
|
||||
Some(_) => Err(ApiError::BadRequest("Unexpected request body".into())),
|
||||
Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -429,8 +429,22 @@ impl PostgresBackend {
|
||||
// full cause of the error, not just the top-level context + its trace.
|
||||
// We don't want to send that in the ErrorResponse though,
|
||||
// because it's not relevant to the compute node logs.
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?;
|
||||
//
|
||||
// We also don't want to log the full stacktrace when the error is simple,
// such as the usual "connection closed" case.
|
||||
let short_error = format!("{:#}", e);
|
||||
let root_cause = e.root_cause().to_string();
|
||||
if root_cause.contains("connection closed unexpectedly")
|
||||
|| root_cause.contains("Broken pipe (os error 32)")
|
||||
{
|
||||
error!(
|
||||
"query handler for '{}' failed: {}",
|
||||
query_string, short_error
|
||||
);
|
||||
} else {
|
||||
error!("query handler for '{}' failed: {:?}", query_string, e);
|
||||
}
|
||||
self.write_message_noflush(&BeMessage::ErrorResponse(&short_error))?;
|
||||
// TODO: untangle convoluted control flow
|
||||
if e.to_string().contains("failed to run") {
|
||||
return Ok(ProcessMsgResult::Break);
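The two format specifiers used above differ as follows for a context-wrapped anyhow error (the message text is made up):

    // let e = anyhow!("connection closed unexpectedly").context("failed to read query");
    // format!("{:#}", e) -> "failed to read query: connection closed unexpectedly"  (one-line chain)
    // format!("{:?}", e) -> a multi-line report with the context, a "Caused by:" list,
    //                       and a backtrace when one was captured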
@@ -931,7 +931,7 @@ impl ReplicationFeedback {
|
||||
|
||||
// Deserialize ReplicationFeedback message
|
||||
pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
|
||||
let mut zf = ReplicationFeedback::empty();
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
let nfields = buf.get_u8();
|
||||
for _ in 0..nfields {
|
||||
let key = read_cstr(&mut buf).unwrap();
|
||||
@@ -939,31 +939,31 @@ impl ReplicationFeedback {
|
||||
b"current_timeline_size" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.current_timeline_size = buf.get_u64();
|
||||
rf.current_timeline_size = buf.get_u64();
|
||||
}
|
||||
b"ps_writelsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.ps_writelsn = buf.get_u64();
|
||||
rf.ps_writelsn = buf.get_u64();
|
||||
}
|
||||
b"ps_flushlsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.ps_flushlsn = buf.get_u64();
|
||||
rf.ps_flushlsn = buf.get_u64();
|
||||
}
|
||||
b"ps_applylsn" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
zf.ps_applylsn = buf.get_u64();
|
||||
rf.ps_applylsn = buf.get_u64();
|
||||
}
|
||||
b"ps_replytime" => {
|
||||
let len = buf.get_i32();
|
||||
assert_eq!(len, 8);
|
||||
let raw_time = buf.get_i64();
|
||||
if raw_time > 0 {
|
||||
zf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_micros(raw_time as u64);
|
||||
} else {
|
||||
zf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
rf.ps_replytime = *PG_EPOCH - Duration::from_micros(-raw_time as u64);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
@@ -976,8 +976,8 @@ impl ReplicationFeedback {
|
||||
}
|
||||
}
|
||||
}
|
||||
trace!("ReplicationFeedback parsed is {:?}", zf);
|
||||
zf
|
||||
trace!("ReplicationFeedback parsed is {:?}", rf);
|
||||
rf
|
||||
}
|
||||
}
|
||||
|
||||
@@ -987,29 +987,29 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_serialization() {
|
||||
let mut zf = ReplicationFeedback::empty();
|
||||
// Fill zf with some values
|
||||
zf.current_timeline_size = 12345678;
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
zf.serialize(&mut data).unwrap();
|
||||
rf.serialize(&mut data).unwrap();
|
||||
|
||||
let zf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(zf, zf_parsed);
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_replication_feedback_unknown_key() {
|
||||
let mut zf = ReplicationFeedback::empty();
|
||||
// Fill zf with some values
|
||||
zf.current_timeline_size = 12345678;
|
||||
let mut rf = ReplicationFeedback::empty();
|
||||
// Fill rf with some values
|
||||
rf.current_timeline_size = 12345678;
|
||||
// Set rounded time to be able to compare it with deserialized value,
|
||||
// because it is rounded up to microseconds during serialization.
|
||||
zf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
rf.ps_replytime = *PG_EPOCH + Duration::from_secs(100_000_000);
|
||||
let mut data = BytesMut::new();
|
||||
zf.serialize(&mut data).unwrap();
|
||||
rf.serialize(&mut data).unwrap();
|
||||
|
||||
// Add an extra field to the buffer and adjust number of keys
|
||||
if let Some(first) = data.first_mut() {
|
||||
@@ -1021,8 +1021,8 @@ mod tests {
|
||||
data.put_u64(42);
|
||||
|
||||
// Parse serialized data and check that new field is not parsed
|
||||
let zf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(zf, zf_parsed);
|
||||
let rf_parsed = ReplicationFeedback::parse(data.freeze());
|
||||
assert_eq!(rf, rf_parsed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -240,7 +240,6 @@ where
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::Arc;
|
||||
use std::thread::sleep;
|
||||
use std::time::Duration;
|
||||
|
||||
impl MonotonicCounter<i32> for i32 {
|
||||
@@ -258,17 +257,19 @@ mod tests {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
let seq3 = Arc::clone(&seq);
|
||||
tokio::task::spawn(async move {
|
||||
let jh1 = tokio::task::spawn(async move {
|
||||
seq2.wait_for(42).await.expect("wait_for 42");
|
||||
let old = seq2.advance(100);
|
||||
assert_eq!(old, 99);
|
||||
seq2.wait_for(999).await.expect_err("no 999");
|
||||
seq2.wait_for_timeout(999, Duration::from_millis(100))
|
||||
.await
|
||||
.expect_err("no 999");
|
||||
});
|
||||
tokio::task::spawn(async move {
|
||||
let jh2 = tokio::task::spawn(async move {
|
||||
seq3.wait_for(42).await.expect("wait_for 42");
|
||||
seq3.wait_for(0).await.expect("wait_for 0");
|
||||
});
|
||||
sleep(Duration::from_secs(1));
|
||||
tokio::time::sleep(Duration::from_millis(200)).await;
|
||||
let old = seq.advance(99);
|
||||
assert_eq!(old, 0);
|
||||
seq.wait_for(100).await.expect("wait_for 100");
|
||||
@@ -277,6 +278,9 @@ mod tests {
|
||||
assert_eq!(seq.advance(98), 100);
|
||||
assert_eq!(seq.load(), 100);
|
||||
|
||||
jh1.await.unwrap();
|
||||
jh2.await.unwrap();
|
||||
|
||||
seq.shutdown();
|
||||
}
|
||||
|
||||
@@ -284,15 +288,18 @@ mod tests {
|
||||
async fn seqwait_timeout() {
|
||||
let seq = Arc::new(SeqWait::new(0));
|
||||
let seq2 = Arc::clone(&seq);
|
||||
tokio::task::spawn(async move {
|
||||
let jh = tokio::task::spawn(async move {
|
||||
let timeout = Duration::from_millis(1);
|
||||
let res = seq2.wait_for_timeout(42, timeout).await;
|
||||
assert_eq!(res, Err(SeqWaitError::Timeout));
|
||||
});
|
||||
tokio::time::sleep(Duration::from_secs(1)).await;
|
||||
tokio::time::sleep(Duration::from_millis(200)).await;
|
||||
// This will attempt to wake, but nothing will happen
|
||||
// because the waiter already dropped its Receiver.
|
||||
let old = seq.advance(99);
|
||||
assert_eq!(old, 0)
|
||||
assert_eq!(old, 0);
|
||||
jh.await.unwrap();
|
||||
|
||||
seq.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,12 +4,12 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
# It is simpler infra-wise to have failpoints enabled by default
|
||||
# It shouldn't affect performance in any way because failpoints
|
||||
# are not placed in hot code paths
|
||||
default = ["failpoints"]
|
||||
default = []
|
||||
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
|
||||
testing = ["fail/failpoints"]
|
||||
|
||||
profiling = ["pprof"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
async-stream = "0.3"
|
||||
@@ -54,7 +54,11 @@ once_cell = "1.13.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
rstar = "0.9.3"
|
||||
num-traits = "0.2.15"
|
||||
amplify_num = "0.4.1"
|
||||
|
||||
pageserver_api = { path = "../libs/pageserver_api" }
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
|
||||
@@ -25,10 +25,10 @@ use tracing::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::v14::xlog_utils::{generate_wal_segment, normalize_lsn, XLogFileName};
|
||||
use postgres_ffi::v14::{CheckPoint, ControlFileData};
|
||||
use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID};
|
||||
use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA};
|
||||
use postgres_ffi::TransactionId;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::PG_TLI;
|
||||
use postgres_ffi::{BLCKSZ, RELSEG_SIZE, WAL_SEGMENT_SIZE};
|
||||
use utils::lsn::Lsn;
|
||||
@@ -129,15 +129,15 @@ where
|
||||
// TODO include checksum
|
||||
|
||||
// Create pgdata subdirs structure
|
||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||
for dir in PGDATA_SUBDIRS.iter() {
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
}
|
||||
|
||||
// Send empty config files.
|
||||
for filepath in pg_constants::PGDATA_SPECIAL_FILES.iter() {
|
||||
for filepath in PGDATA_SPECIAL_FILES.iter() {
|
||||
if *filepath == "pg_hba.conf" {
|
||||
let data = pg_constants::PG_HBA.as_bytes();
|
||||
let data = PG_HBA.as_bytes();
|
||||
let header = new_tar_header(filepath, data.len() as u64)?;
|
||||
self.ar.append(&header, data)?;
|
||||
} else {
|
||||
@@ -267,16 +267,12 @@ where
|
||||
None
|
||||
};
|
||||
|
||||
// TODO pass this as a parameter
|
||||
let pg_version = "14";
|
||||
if spcnode == GLOBALTABLESPACE_OID {
|
||||
let pg_version_str = self.timeline.pg_version.to_string();
|
||||
let header = new_tar_header("PG_VERSION", pg_version_str.len() as u64)?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes())?;
|
||||
|
||||
if spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
let version_bytes = pg_version.as_bytes();
|
||||
let header = new_tar_header("PG_VERSION", version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
|
||||
let header = new_tar_header("global/PG_VERSION", version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
info!("timeline.pg_version {}", self.timeline.pg_version);
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
// filenode map for global tablespace
|
||||
@@ -305,7 +301,7 @@ where
|
||||
return Ok(());
|
||||
}
|
||||
// User defined tablespaces are not supported
|
||||
ensure!(spcnode == pg_constants::DEFAULTTABLESPACE_OID);
|
||||
ensure!(spcnode == DEFAULTTABLESPACE_OID);
|
||||
|
||||
// Append dir path for each database
|
||||
let path = format!("base/{}", dbnode);
|
||||
@@ -314,9 +310,10 @@ where
|
||||
|
||||
if let Some(img) = relmap_img {
|
||||
let dst_path = format!("base/{}/PG_VERSION", dbnode);
|
||||
let version_bytes = pg_version.as_bytes();
|
||||
let header = new_tar_header(&dst_path, version_bytes.len() as u64)?;
|
||||
self.ar.append(&header, version_bytes)?;
|
||||
|
||||
let pg_version_str = self.timeline.pg_version.to_string();
|
||||
let header = new_tar_header(&dst_path, pg_version_str.len() as u64)?;
|
||||
self.ar.append(&header, pg_version_str.as_bytes())?;
|
||||
|
||||
let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
|
||||
let header = new_tar_header(&relmap_path, img.len() as u64)?;
|
||||
@@ -348,30 +345,6 @@ where
|
||||
// Also send zenith.signal file with extra bootstrap data.
|
||||
//
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_checkpoint(self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_control_file(self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
// Generate new pg_control needed for bootstrap
|
||||
checkpoint.redo = normalize_lsn(self.lsn, WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
//TODO Check this.
|
||||
//We may need to determine the value from twophase data.
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
pg_control.state = pg_constants::DB_SHUTDOWNED;
|
||||
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
@@ -388,8 +361,23 @@ where
|
||||
zenith_signal.as_bytes(),
|
||||
)?;
|
||||
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_checkpoint(self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_control_file(self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
|
||||
let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
|
||||
&pg_control_bytes,
|
||||
&checkpoint_bytes,
|
||||
self.lsn,
|
||||
self.timeline.pg_version,
|
||||
)?;
|
||||
|
||||
//send pg_control
|
||||
let pg_control_bytes = pg_control.encode();
|
||||
let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
|
||||
self.ar.append(&header, &pg_control_bytes[..])?;
|
||||
|
||||
@@ -398,8 +386,10 @@ where
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, WAL_SEGMENT_SIZE);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
|
||||
let wal_seg =
|
||||
postgres_ffi::generate_wal_segment(segno, system_identifier, self.timeline.pg_version)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
ensure!(wal_seg.len() == WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
|
||||
@@ -87,8 +87,8 @@ fn main() -> anyhow::Result<()> {
|
||||
|
||||
if arg_matches.is_present("enabled-features") {
|
||||
let features: &[&str] = &[
|
||||
#[cfg(feature = "failpoints")]
|
||||
"failpoints",
|
||||
#[cfg(feature = "testing")]
|
||||
"testing",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
|
||||
@@ -50,6 +50,7 @@ fn main() -> Result<()> {
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
@@ -62,6 +63,7 @@ fn main() -> Result<()> {
|
||||
meta.ancestor_lsn(),
|
||||
meta.latest_gc_cutoff_lsn(),
|
||||
meta.initdb_lsn(),
|
||||
meta.pg_version(),
|
||||
);
|
||||
update_meta = true;
|
||||
}
|
||||
|
||||
@@ -22,14 +22,18 @@ use utils::{
|
||||
use crate::tenant::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
pub use pageserver_api::{
|
||||
DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR,
|
||||
DEFAULT_PG_LISTEN_PORT,
|
||||
};
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
@@ -205,7 +209,7 @@ impl Default for PageServerConfigBuilder {
|
||||
workdir: Set(PathBuf::new()),
|
||||
pg_distrib_dir: Set(env::current_dir()
|
||||
.expect("cannot access current directory")
|
||||
.join("pg_install/v14")),
|
||||
.join("pg_install")),
|
||||
auth_type: Set(AuthType::Trust),
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
@@ -346,6 +350,12 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_id.to_string())
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn tenant_config_path(&self, tenant_id: TenantId) -> PathBuf {
|
||||
self.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
|
||||
self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
|
||||
}
|
||||
@@ -368,18 +378,42 @@ impl PageServerConf {
|
||||
.join(tenant_id.to_string())
|
||||
.join(timeline_id.to_string())
|
||||
.join(connection_id.to_string())
|
||||
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(&self, timeline_id: TimelineId, tenant_id: TenantId) -> PathBuf {
|
||||
self.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
pub fn pg_distrib_dir(&self, pg_version: u32) -> PathBuf {
|
||||
let path = self.pg_distrib_dir.clone();
|
||||
|
||||
pub fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
match pg_version {
|
||||
14 => path.join(format!("v{pg_version}")),
|
||||
15 => path.join(format!("v{pg_version}")),
|
||||
_ => panic!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
pub fn pg_bin_dir(&self, pg_version: u32) -> PathBuf {
|
||||
match pg_version {
|
||||
14 => self.pg_distrib_dir(pg_version).join("bin"),
|
||||
15 => self.pg_distrib_dir(pg_version).join("bin"),
|
||||
_ => panic!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
pub fn pg_lib_dir(&self, pg_version: u32) -> PathBuf {
|
||||
match pg_version {
|
||||
14 => self.pg_distrib_dir(pg_version).join("lib"),
|
||||
15 => self.pg_distrib_dir(pg_version).join("lib"),
|
||||
_ => panic!("Unsupported postgres version: {}", pg_version),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a configuration file (pageserver.toml) into a PageServerConf struct,
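With these helpers the distribution layout is resolved per Postgres version; for example, with pg_distrib_dir set to "/data/pg_install" (an illustrative path):

    // conf.pg_distrib_dir(14) -> /data/pg_install/v14
    // conf.pg_bin_dir(14)     -> /data/pg_install/v14/bin
    // conf.pg_lib_dir(15)     -> /data/pg_install/v15/lib
    // Any other version hits the "Unsupported postgres version" panic.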
@@ -448,13 +482,6 @@ impl PageServerConf {
|
||||
);
|
||||
}
|
||||
|
||||
if !conf.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
conf.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
Ok(conf)
|
||||
@@ -624,6 +651,7 @@ mod tests {
|
||||
use tempfile::{tempdir, TempDir};
|
||||
|
||||
use super::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
|
||||
const ALL_BASE_VALUES_TOML: &str = r#"
|
||||
# Initial configuration file created by 'pageserver --init'
|
||||
@@ -863,8 +891,9 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
fs::create_dir_all(&workdir)?;
|
||||
|
||||
let pg_distrib_dir = tempdir_path.join("pg_distrib");
|
||||
fs::create_dir_all(&pg_distrib_dir)?;
|
||||
let postgres_bin_dir = pg_distrib_dir.join("bin");
|
||||
let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}"));
|
||||
fs::create_dir_all(&pg_distrib_dir_versioned)?;
|
||||
let postgres_bin_dir = pg_distrib_dir_versioned.join("bin");
|
||||
fs::create_dir_all(&postgres_bin_dir)?;
|
||||
fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?;
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
pub mod models;
|
||||
pub mod routes;
|
||||
pub use routes::make_router;
|
||||
|
||||
pub use pageserver_api::models;
|
||||
|
||||
@@ -307,6 +307,7 @@ paths:
|
||||
description: |
|
||||
Create a timeline. Returns new timeline id on success.\
|
||||
If no new timeline id is specified in parameters, it would be generated. It's an error to recreate the same timeline.
|
||||
If no pg_version is specified, the DEFAULT_PG_VERSION hardcoded in the pageserver is assumed.
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
@@ -322,6 +323,8 @@ paths:
|
||||
ancestor_start_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
pg_version:
|
||||
type: integer
|
||||
responses:
|
||||
"201":
|
||||
description: TimelineInfo
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tokio::task::JoinError;
|
||||
use tracing::*;
|
||||
|
||||
use super::models::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
@@ -15,7 +16,7 @@ use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant::{TenantState, Timeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
use crate::{config::PageServerConf, tenant_mgr};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::{
|
||||
@@ -29,6 +30,12 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
// Imports only used for testing APIs
|
||||
#[cfg(feature = "testing")]
|
||||
use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::CheckpointConfig;
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
@@ -123,6 +130,7 @@ fn local_timeline_info_from_timeline(
|
||||
wal_source_connstr,
|
||||
last_received_msg_lsn,
|
||||
last_received_msg_ts,
|
||||
pg_version: timeline.pg_version,
|
||||
};
|
||||
Ok(info)
|
||||
}
|
||||
@@ -160,17 +168,18 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
let request_data: TimelineCreateRequest = json_request(&mut request).await?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
let new_timeline_info = async {
|
||||
match timelines::create_timeline(
|
||||
get_config(&request),
|
||||
tenant_id,
|
||||
match tenant.create_timeline(
|
||||
request_data.new_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||
request_data.ancestor_start_lsn,
|
||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION)
|
||||
).await {
|
||||
Ok(Some(new_timeline)) => {
|
||||
// Created. Construct a TimelineInfo for it.
|
||||
let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)?;
|
||||
let local_info = local_timeline_info_from_timeline(&new_timeline, false, false)
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
Ok(Some(TimelineInfo {
|
||||
tenant_id,
|
||||
timeline_id: new_timeline.timeline_id,
|
||||
@@ -179,12 +188,11 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
|
||||
}))
|
||||
}
|
||||
Ok(None) => Ok(None), // timeline already exists
|
||||
Err(err) => Err(err),
|
||||
Err(err) => Err(ApiError::InternalServerError(err)),
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
|
||||
.await?;
|
||||
|
||||
Ok(match new_timeline_info {
|
||||
Some(info) => json_response(StatusCode::CREATED, info)?,
|
||||
@@ -202,10 +210,11 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
let timelines = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("timeline_list", tenant = %tenant_id).entered();
|
||||
Ok::<_, anyhow::Error>(tenant_mgr::get_tenant(tenant_id, true)?.list_timelines())
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
|
||||
Ok(tenant.list_timelines())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let mut response_data = Vec::with_capacity(timelines.len());
|
||||
for (timeline_id, timeline) in timelines {
|
||||
@@ -270,7 +279,7 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
tenant_mgr::get_tenant(tenant_id, true)?.get_timeline(timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let local_timeline_info = match timeline.and_then(|timeline| {
|
||||
local_timeline_info_from_timeline(
|
||||
@@ -298,13 +307,13 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
})
|
||||
};
|
||||
Ok::<_, anyhow::Error>((local_timeline_info, remote_timeline_info))
|
||||
Ok::<_, ApiError>((local_timeline_info, remote_timeline_info))
|
||||
}
|
||||
.instrument(info_span!("timeline_detail", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await?;
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
Err(ApiError::NotFound(format!(
|
||||
Err(ApiError::NotFound(anyhow!(
|
||||
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
|
||||
)))
|
||||
} else {
|
||||
@@ -327,14 +336,21 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
|
||||
info!("Handling tenant attach {tenant_id}");
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_tenant(tenant_id, false).is_ok() {
|
||||
anyhow::bail!("Tenant is already present locally")
|
||||
};
|
||||
Ok(())
|
||||
tokio::task::spawn_blocking(move || match tenant_mgr::get_tenant(tenant_id, false) {
|
||||
Ok(tenant) => {
|
||||
if tenant.list_timelines().is_empty() {
|
||||
info!("Attaching to tenant {tenant_id} with zero timelines");
|
||||
Ok(())
|
||||
} else {
|
||||
Err(ApiError::Conflict(
|
||||
"Tenant is already present locally".to_owned(),
|
||||
))
|
||||
}
|
||||
}
|
||||
Err(_) => Ok(()),
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
@@ -359,12 +375,12 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
// download index parts for every tenant timeline
|
||||
let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await {
|
||||
Ok(Some(remote_timelines)) => remote_timelines,
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())),
|
||||
Ok(None) => return Err(ApiError::NotFound(anyhow!("Unknown remote tenant"))),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote tenant data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote tenant".to_string(),
|
||||
));
|
||||
return Err(ApiError::NotFound(anyhow!(
|
||||
"Failed to retrieve remote tenant"
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -387,7 +403,8 @@ async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
for (timeline_id, mut remote_timeline) in remote_timelines {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
.context("Failed to create new timeline directory")
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
tenant_entry.insert(timeline_id, remote_timeline);
|
||||
@@ -433,7 +450,10 @@ async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body
|
||||
tenant_mgr::delete_timeline(tenant_id, timeline_id)
|
||||
.instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
// FIXME: Errors from `delete_timeline` can occur for a number of reasons, including both
// user and internal errors. Replace this with better handling once the error type permits
// it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_timeline_entry(TenantTimelineId {
|
||||
@@ -453,7 +473,9 @@ async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
tenant_mgr::detach_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("tenant_detach", tenant = %tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
// FIXME: Errors from `detach_tenant` can be caused by both user and internal errors.
// Replace this with better handling once the error type permits it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_tenant_entry(&tenant_id);
|
||||
@@ -473,7 +495,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
crate::tenant_mgr::list_tenant_info(&remote_index)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -485,7 +507,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
// if tenant is in progress of downloading it can be absent in global tenant map
|
||||
let tenant = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant(tenant_id, false))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
@@ -514,7 +536,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
let current_physical_size =
|
||||
match tokio::task::spawn_blocking(move || list_local_timelines(tenant_id, false, false))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))?
|
||||
{
|
||||
Err(err) => {
|
||||
// Getting local timelines can fail when no local tenant directory is on disk (e.g, when tenant data is being downloaded).
|
||||
@@ -540,6 +562,16 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
|
||||
)
|
||||
}
|
||||
|
||||
// Helper function to standardize the error messages we produce on bad durations
|
||||
//
|
||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
||||
//
|
||||
// let value = result.with_context(bad_duration("name", &value))?;
|
||||
//
|
||||
fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
|
||||
move || format!("Cannot parse `{field_name}` duration {value:?}")
|
||||
}
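As a concrete example of the message this helper produces (the value is made up): if parsing gc_period = "10 parsecs" fails, the attached context reads:

    // Cannot parse `gc_period` duration "10 parsecs"
    // (the {value:?} placeholder debug-quotes the offending string)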
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
check_permission(&request, None)?;
|
||||
|
||||
@@ -548,25 +580,39 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
tenant_conf.gc_period = Some(
|
||||
humantime::parse_duration(&gc_period)
|
||||
.with_context(bad_duration("gc_period", &gc_period))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
tenant_conf.pitr_interval = Some(
|
||||
humantime::parse_duration(&pitr_interval)
|
||||
.with_context(bad_duration("pitr_interval", &pitr_interval))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
|
||||
tenant_conf.walreceiver_connect_timeout = Some(
|
||||
humantime::parse_duration(&walreceiver_connect_timeout).map_err(ApiError::from_err)?,
|
||||
humantime::parse_duration(&walreceiver_connect_timeout)
|
||||
.with_context(bad_duration(
|
||||
"walreceiver_connect_timeout",
|
||||
&walreceiver_connect_timeout,
|
||||
))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
|
||||
tenant_conf.lagging_wal_timeout =
|
||||
Some(humantime::parse_duration(&lagging_wal_timeout).map_err(ApiError::from_err)?);
|
||||
tenant_conf.lagging_wal_timeout = Some(
|
||||
humantime::parse_duration(&lagging_wal_timeout)
|
||||
.with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
@@ -577,16 +623,22 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
|
||||
tenant_conf.checkpoint_timeout =
|
||||
Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
|
||||
tenant_conf.checkpoint_timeout = Some(
|
||||
humantime::parse_duration(&checkpoint_timeout)
|
||||
.with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
tenant_conf.compaction_period = Some(
|
||||
humantime::parse_duration(&compaction_period)
|
||||
.with_context(bad_duration("compaction_period", &compaction_period))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
let target_tenant_id = request_data
|
||||
@@ -599,9 +651,12 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let conf = get_config(&request);
|
||||
|
||||
tenant_mgr::create_tenant(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
// FIXME: `create_tenant` can fail from both user and internal errors. Replace this
|
||||
// with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
Ok(match new_tenant_id {
|
||||
Some(id) => json_response(StatusCode::CREATED, TenantCreateResponse(id))?,
|
||||
@@ -616,24 +671,38 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
tenant_conf.gc_period = Some(
|
||||
humantime::parse_duration(&gc_period)
|
||||
.with_context(bad_duration("gc_period", &gc_period))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
tenant_conf.pitr_interval = Some(
|
||||
humantime::parse_duration(&pitr_interval)
|
||||
.with_context(bad_duration("pitr_interval", &pitr_interval))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
if let Some(walreceiver_connect_timeout) = request_data.walreceiver_connect_timeout {
|
||||
tenant_conf.walreceiver_connect_timeout = Some(
|
||||
humantime::parse_duration(&walreceiver_connect_timeout).map_err(ApiError::from_err)?,
|
||||
humantime::parse_duration(&walreceiver_connect_timeout)
|
||||
.with_context(bad_duration(
|
||||
"walreceiver_connect_timeout",
|
||||
&walreceiver_connect_timeout,
|
||||
))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
if let Some(lagging_wal_timeout) = request_data.lagging_wal_timeout {
|
||||
tenant_conf.lagging_wal_timeout =
|
||||
Some(humantime::parse_duration(&lagging_wal_timeout).map_err(ApiError::from_err)?);
|
||||
tenant_conf.lagging_wal_timeout = Some(
|
||||
humantime::parse_duration(&lagging_wal_timeout)
|
||||
.with_context(bad_duration("lagging_wal_timeout", &lagging_wal_timeout))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
|
||||
tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
|
||||
@@ -641,15 +710,21 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
if let Some(checkpoint_timeout) = request_data.checkpoint_timeout {
|
||||
tenant_conf.checkpoint_timeout =
|
||||
Some(humantime::parse_duration(&checkpoint_timeout).map_err(ApiError::from_err)?);
|
||||
tenant_conf.checkpoint_timeout = Some(
|
||||
humantime::parse_duration(&checkpoint_timeout)
|
||||
.with_context(bad_duration("checkpoint_timeout", &checkpoint_timeout))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
tenant_conf.compaction_period = Some(
|
||||
humantime::parse_duration(&compaction_period)
|
||||
.with_context(bad_duration("compaction_period", &compaction_period))
|
||||
.map_err(ApiError::BadRequest)?,
|
||||
);
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
@@ -657,9 +732,114 @@ async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::update_tenant_config(state.conf, tenant_conf, tenant_id)
|
||||
// FIXME: `update_tenant_config` can fail because of both user and internal errors.
|
||||
// Replace this `map_err` with better error handling once the type permits it
|
||||
.map_err(ApiError::InternalServerError)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
.map_err(|e: JoinError| ApiError::InternalServerError(e.into()))??;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
#[cfg(any(feature = "testing", feature = "failpoints"))]
|
||||
async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
if !fail::has_failpoints() {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Cannot manage failpoints because pageserver was compiled without failpoints support"
|
||||
)));
|
||||
}
|
||||
|
||||
let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?;
|
||||
for fp in failpoints {
|
||||
info!("cfg failpoint: {} {}", fp.name, fp.actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
let cfg_result = if fp.actions == "exit" {
|
||||
fail::cfg_callback(fp.name, || {
|
||||
info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
})
|
||||
} else {
|
||||
fail::cfg(fp.name, &fp.actions)
|
||||
};
|
||||
|
||||
if let Err(err_msg) = cfg_result {
|
||||
return Err(ApiError::BadRequest(anyhow!(
|
||||
"Failed to configure failpoints: {err_msg}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
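For reference, an illustrative request body for this endpoint, assuming the JSON field names match the struct fields used above (`name` and `actions`); the failpoint name itself is made up:

    // PUT /v1/failpoints
    // [
    //   { "name": "some-failpoint", "actions": "return" },
    //   { "name": "some-failpoint", "actions": "exit" }   // the extra action handled above
    // ]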
// Run GC immediately on given timeline.
|
||||
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
|
||||
// This probably should require special authentication or a global flag to
|
||||
// enable, I don't think we want to or need to allow regular clients to invoke
|
||||
// GC.
|
||||
// @hllinnaka in commits ec44f4b29, 3aca717f3
|
||||
#[cfg(feature = "testing")]
|
||||
async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// FIXME: currently this will return a 500 error on bad tenant id; it should be 4XX
|
||||
let repo = tenant_mgr::get_tenant(tenant_id, false).map_err(ApiError::NotFound)?;
|
||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||
|
||||
let _span_guard =
|
||||
info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id).entered();
|
||||
let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| repo.get_gc_horizon());
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = repo.get_pitr_interval();
|
||||
let result = repo
|
||||
.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)
|
||||
// FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
|
||||
// better once the types support it.
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
json_response(StatusCode::OK, result)
|
||||
}
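// Illustrative call (not part of the patch; host, port, ids and horizon are
// placeholders), matching the TimelineGcRequest body consumed above:
//
//   curl -X PUT \
//     http://127.0.0.1:9898/v1/tenant/<tenant_id>/timeline/<timeline_id>/do_gc \
//     -d '{"gc_horizon": 67108864}'
//
// Omitting "gc_horizon" falls back to the tenant's configured horizon, and the
// tenant's PITR interval is always used.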

// Run compaction immediately on given timeline.
// FIXME This is just for tests. Don't expect this to be exposed to
// the users or the api.
// @dhammika in commit a0781f229
#[cfg(feature = "testing")]
async fn timeline_compact_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;

let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
timeline.compact().map_err(ApiError::InternalServerError)?;

json_response(StatusCode::OK, ())
}

// Run checkpoint immediately on given timeline.
#[cfg(feature = "testing")]
async fn timeline_checkpoint_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
check_permission(&request, Some(tenant_id))?;

let repo = tenant_mgr::get_tenant(tenant_id, true).map_err(ApiError::NotFound)?;
let timeline = repo
.get_timeline(timeline_id)
.with_context(|| format!("No timeline {timeline_id} in repository for tenant {tenant_id}"))
.map_err(ApiError::NotFound)?;
timeline
.checkpoint(CheckpointConfig::Forced)
.map_err(ApiError::InternalServerError)?;

json_response(StatusCode::OK, ())
}

@@ -690,12 +870,35 @@ pub fn make_router(
}))
}

macro_rules! testing_api {
($handler_desc:literal, $handler:path $(,)?) => {{
#[cfg(not(feature = "testing"))]
async fn cfg_disabled(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
Err(ApiError::BadRequest(anyhow!(concat!(
"Cannot ",
$handler_desc,
" because pageserver was compiled without testing APIs",
))))
}

#[cfg(feature = "testing")]
let handler = $handler;
#[cfg(not(feature = "testing"))]
let handler = cfg_disabled;
handler
}};
}
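// For clarity (comment added here, not in the patch): a route registered as
// `testing_api!("run timeline GC", timeline_gc_handler)` expands to the real handler
// when the crate is built with `--features testing`, and otherwise to a stub that
// answers 400 with "Cannot run timeline GC because pageserver was compiled without
// testing APIs".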

Ok(router
.data(Arc::new(
State::new(conf, auth, remote_index, remote_storage)
.context("Failed to initialize router state")?,
))
.get("/v1/status", status_handler)
.put(
"/v1/failpoints",
testing_api!("manage failpoints", failpoints_handler),
)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id", tenant_status)
@@ -708,6 +911,18 @@ pub fn make_router(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_detail_handler,
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
testing_api!("run timeline GC", timeline_gc_handler),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
testing_api!("run timeline compaction", timeline_compact_handler),
)
.put(
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
testing_api!("run timeline checkpoint", timeline_checkpoint_handler),
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_delete_handler,

||||
@@ -16,11 +16,13 @@ use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use postgres_ffi::v14::relfile_utils::*;
|
||||
use postgres_ffi::v14::waldecoder::*;
|
||||
use postgres_ffi::v14::xlog_utils::*;
|
||||
use postgres_ffi::v14::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::ControlFileData;
|
||||
use postgres_ffi::DBState_DB_SHUTDOWNED;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::XLogFileName;
|
||||
use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
@@ -236,7 +238,7 @@ fn import_slru<Reader: Read>(
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal(walpath: &Path, tline: &Timeline, startpoint: Lsn, endpoint: Lsn) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version);
|
||||
|
||||
let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE);
|
||||
@@ -354,7 +356,7 @@ pub fn import_wal_from_tar<Reader: Read>(
|
||||
end_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Set up walingest mutable state
|
||||
let mut waldecoder = WalStreamDecoder::new(start_lsn);
|
||||
let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version);
|
||||
let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE);
|
||||
let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
@@ -439,7 +441,7 @@ fn import_file<Reader: Read>(
|
||||
len: usize,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
if file_path.starts_with("global") {
|
||||
let spcnode = pg_constants::GLOBALTABLESPACE_OID;
|
||||
let spcnode = postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
let dbnode = 0;
|
||||
|
||||
match file_path
|
||||
@@ -467,7 +469,7 @@ fn import_file<Reader: Read>(
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
debug!("ignored");
|
||||
debug!("ignored PG_VERSION file");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||
@@ -495,7 +497,7 @@ fn import_file<Reader: Read>(
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
debug!("ignored");
|
||||
debug!("ignored PG_VERSION file");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||
|
||||
@@ -33,11 +33,15 @@ use crate::task_mgr::TaskKind;
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
/// This is embedded in the metadata file, and also in the header of all the
|
||||
/// layer files. If you make any backwards-incompatible changes to the storage
|
||||
/// This is embedded in the header of all the layer files.
|
||||
/// If you make any backwards-incompatible changes to the storage
|
||||
/// format, bump this!
|
||||
/// Note that TimelineMetadata uses its own version number to track
|
||||
/// backwards-compatible changes to the metadata format.
|
||||
pub const STORAGE_FORMAT_VERSION: u16 = 3;
|
||||
|
||||
pub const DEFAULT_PG_VERSION: u32 = 14;
|
||||
|
||||
// Magic constants used to identify different kinds of files
|
||||
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
|
||||
pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
@@ -106,7 +110,7 @@ fn exponential_backoff_duration_seconds(n: u32, base_increment: f64, max_seconds
|
||||
}
|
||||
|
||||
/// A newtype to store arbitrary data grouped by tenant and timeline ids.
|
||||
/// One could use [`utils::zid::TenantTimelineId`] for grouping, but that would
|
||||
/// One could use [`utils::id::TenantTimelineId`] for grouping, but that would
|
||||
/// not include the cases where a certain tenant has zero timelines.
|
||||
/// This is sometimes important: a tenant could be registered during initial load from FS,
|
||||
/// even if he has no timelines on disk.
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
use metrics::core::{AtomicU64, GenericCounter};
|
||||
use metrics::{
|
||||
register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec,
|
||||
register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec,
|
||||
IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||
register_gauge_vec, register_histogram, register_histogram_vec, register_int_counter,
|
||||
register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
|
||||
GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge,
|
||||
UIntGaugeVec,
|
||||
};
|
||||
use once_cell::sync::Lazy;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
@@ -204,12 +205,34 @@ pub static REMAINING_SYNC_ITEMS: Lazy<IntGauge> = Lazy::new(|| {
|
||||
.expect("failed to register pageserver remote storage remaining sync items int gauge")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_TIME: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
pub static IMAGE_SYNC_TIME: Lazy<GaugeVec> = Lazy::new(|| {
|
||||
register_gauge_vec!(
|
||||
"pageserver_remote_storage_image_sync_duration",
|
||||
"Time spent to synchronize (up/download) a whole pageserver image",
|
||||
&["tenant_id", "timeline_id"],
|
||||
)
|
||||
.expect("failed to register per-timeline pageserver image sync time vec")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_OPERATION_KINDS: &[&str] = &["upload", "download", "delete"];
|
||||
pub static IMAGE_SYNC_STATUS: &[&str] = &["success", "failure", "abort"];
|
||||
|
||||
pub static IMAGE_SYNC_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_remote_storage_image_sync_count",
|
||||
"Number of synchronization operations executed for pageserver images. \
|
||||
Grouped by tenant, timeline, operation_kind and status",
|
||||
&["tenant_id", "timeline_id", "operation_kind", "status"]
|
||||
)
|
||||
.expect("failed to register pageserver image sync count vec")
|
||||
});
|
||||
|
||||
pub static IMAGE_SYNC_TIME_HISTOGRAM: Lazy<HistogramVec> = Lazy::new(|| {
|
||||
register_histogram_vec!(
|
||||
"pageserver_remote_storage_image_sync_seconds",
|
||||
"Time took to synchronize (download or upload) a whole pageserver image. \
|
||||
Grouped by tenant and timeline ids, `operation_kind` (upload|download) and `status` (success|failure)",
|
||||
&["tenant_id", "timeline_id", "operation_kind", "status"],
|
||||
Grouped by operation_kind and status",
|
||||
&["operation_kind", "status"],
|
||||
vec![0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 3.0, 10.0, 20.0]
|
||||
)
|
||||
.expect("failed to register pageserver image sync time histogram vec")
|
||||
@@ -256,7 +279,7 @@ macro_rules! redo_histogram_time_buckets {
|
||||
() => {
|
||||
vec![
|
||||
0.000_005, 0.000_010, 0.000_025, 0.000_050, 0.000_100, 0.000_250, 0.000_500, 0.001_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000,
|
||||
0.002_500, 0.005_000, 0.010_000, 0.025_000, 0.050_000, 0.100_000, 0.250_000,
|
||||
]
|
||||
};
|
||||
}
|
||||
@@ -411,6 +434,14 @@ impl Drop for TimelineMetrics {
|
||||
for op in SMGR_QUERY_TIME_OPERATIONS {
|
||||
let _ = SMGR_QUERY_TIME.remove_label_values(&[op, tenant_id, timeline_id]);
|
||||
}
|
||||
|
||||
for op in IMAGE_SYNC_OPERATION_KINDS {
|
||||
for status in IMAGE_SYNC_STATUS {
|
||||
let _ = IMAGE_SYNC_COUNT.remove_label_values(&[tenant_id, timeline_id, op, status]);
|
||||
}
|
||||
}
|
||||
|
||||
let _ = IMAGE_SYNC_TIME.remove_label_values(&[tenant_id, timeline_id]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
postgres_backend_async::{self, PostgresBackend},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor},
|
||||
simple_rcu::RcuReadGuard,
|
||||
};
|
||||
|
||||
@@ -46,9 +46,9 @@ use crate::tenant::Timeline;
|
||||
use crate::tenant_mgr;
|
||||
use crate::trace::Tracer;
|
||||
use crate::CheckpointConfig;
|
||||
use postgres_ffi::v14::xlog_utils::to_pg_timestamp;
|
||||
|
||||
use postgres_ffi::v14::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -569,12 +569,16 @@ impl PageServerHandler {
|
||||
timeline_id: TimelineId,
|
||||
base_lsn: Lsn,
|
||||
_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let timeline = tenant_mgr::get_tenant(tenant_id, true)?
|
||||
.create_empty_timeline(timeline_id, base_lsn)?;
|
||||
let timeline = tenant_mgr::get_tenant(tenant_id, true)?.create_empty_timeline(
|
||||
timeline_id,
|
||||
base_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
@@ -734,7 +738,7 @@ impl PageServerHandler {
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
#[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
async fn handle_get_rel_exists_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
@@ -751,7 +755,7 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip(timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))]
|
||||
async fn handle_get_nblocks_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
@@ -768,7 +772,7 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip(timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
|
||||
async fn handle_db_size_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
@@ -788,7 +792,7 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
#[instrument(skip(timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
|
||||
#[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
|
||||
async fn handle_get_page_at_lsn_request(
|
||||
&self,
|
||||
timeline: &Timeline,
|
||||
@@ -1026,19 +1030,27 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// 1. Get start/end LSN from backup_manifest file
// 2. Run:
// cat my_backup/base.tar | psql -h $PAGESERVER \
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN"
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION"
let (_, params_raw) = query_string.split_at("import basebackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(params.len() == 4);
ensure!(params.len() == 5);
let tenant_id = TenantId::from_str(params[0])?;
let timeline_id = TimelineId::from_str(params[1])?;
let base_lsn = Lsn::from_str(params[2])?;
let end_lsn = Lsn::from_str(params[3])?;
let pg_version = u32::from_str(params[4])?;
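// Example of the new 5-parameter form (values are placeholders, shown only to
// illustrate the parsing above):
//   import basebackup <tenant_id> <timeline_id> 0/16B3748 0/16C5A00 14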
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
match self
|
||||
.handle_import_basebackup(pgb, tenant_id, timeline_id, base_lsn, end_lsn)
|
||||
.handle_import_basebackup(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
base_lsn,
|
||||
end_lsn,
|
||||
pg_version,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
@@ -1076,37 +1088,15 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("failpoints ") {
|
||||
ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support");
|
||||
|
||||
let (_, failpoints) = query_string.split_at("failpoints ".len());
|
||||
|
||||
for failpoint in failpoints.split(';') {
|
||||
if let Some((name, actions)) = failpoint.split_once('=') {
|
||||
info!("cfg failpoint: {} {}", name, actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, || {
|
||||
info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
})
|
||||
.unwrap();
|
||||
} else {
|
||||
fail::cfg(name, actions).unwrap();
|
||||
}
|
||||
} else {
|
||||
bail!("Invalid failpoints format");
|
||||
}
|
||||
}
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("show ") {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
ensure!(params.len() == 1, "invalid param number for config command");
|
||||
let tenant_id = TenantId::from_str(params[0])?;
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
pgb.write_message(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
@@ -1143,91 +1133,6 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("do_gc ") {
|
||||
// Run GC immediately on given timeline.
|
||||
// FIXME: This is just for tests. See test_runner/regress/test_gc.py.
|
||||
// This probably should require special authentication or a global flag to
|
||||
// enable, I don't think we want to or need to allow regular clients to invoke
|
||||
// GC.
|
||||
|
||||
// do_gc <tenant_id> <timeline_id> <gc_horizon>
|
||||
let re = Regex::new(r"^do_gc ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)([[:digit:]]+)?")
|
||||
.unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("invalid do_gc: '{}'", query_string))?;
|
||||
|
||||
let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
let gc_horizon: u64 = caps
|
||||
.get(4)
|
||||
.map(|h| h.as_str().parse())
|
||||
.unwrap_or_else(|| Ok(tenant.get_gc_horizon()))?;
|
||||
|
||||
// Use tenant's pitr setting
|
||||
let pitr = tenant.get_pitr_interval();
|
||||
let result = tenant.gc_iteration(Some(timeline_id), gc_horizon, pitr, true)?;
|
||||
pgb.write_message(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layers_total"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_pitr"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layers_not_updated"),
|
||||
RowDescriptor::int8_col(b"layers_removed"),
|
||||
RowDescriptor::int8_col(b"elapsed"),
|
||||
]))?
|
||||
.write_message(&BeMessage::DataRow(&[
|
||||
Some(result.layers_total.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_pitr.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_branches.to_string().as_bytes()),
|
||||
Some(result.layers_not_updated.to_string().as_bytes()),
|
||||
Some(result.layers_removed.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("compact ") {
|
||||
// Run compaction immediately on given timeline.
|
||||
// FIXME This is just for tests. Don't expect this to be exposed to
|
||||
// the users or the api.
|
||||
|
||||
// compact <tenant_id> <timeline_id>
|
||||
let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("Invalid compact: '{}'", query_string))?;
|
||||
|
||||
let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
timeline.compact()?;
|
||||
|
||||
pgb.write_message(&SINGLE_COL_ROWDESC)?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("checkpoint ") {
|
||||
// Run checkpoint immediately on given timeline.
|
||||
|
||||
// checkpoint <tenant_id> <timeline_id>
|
||||
let re = Regex::new(r"^checkpoint ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("invalid checkpoint command: '{}'", query_string))?;
|
||||
|
||||
let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
|
||||
// Checkpoint the timeline and also compact it (due to `CheckpointConfig::Forced`).
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
pgb.write_message(&SINGLE_COL_ROWDESC)?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("get_lsn_by_timestamp ") {
|
||||
// Locate the LSN of the last transaction with a timestamp less than or equal to the specified one
|
||||
// TODO lazy static
|
||||
@@ -1236,14 +1141,14 @@ impl postgres_backend_async::Handler for PageServerHandler {
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?;
|
||||
|
||||
let tenant_id = TenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timeline_id = TimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
|
||||
let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?;
|
||||
let timestamp_pg = to_pg_timestamp(timestamp);
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
let timeline = get_local_timeline(tenant_id, timeline_id)?;
|
||||
pgb.write_message(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"lsn",
|
||||
)]))?;
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::tenant::Timeline;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -125,8 +125,7 @@ impl Timeline {
|
||||
return Ok(nblocks);
|
||||
}
|
||||
|
||||
if (tag.forknum == pg_constants::FSM_FORKNUM
|
||||
|| tag.forknum == pg_constants::VISIBILITYMAP_FORKNUM)
|
||||
if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
|
||||
&& !self.get_rel_exists(tag, lsn, latest)?
|
||||
{
|
||||
// FIXME: Postgres sometimes calls smgrcreate() to create
|
||||
@@ -1090,6 +1089,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
// 03 misc
|
||||
// controlfile
|
||||
// checkpoint
|
||||
// pg_version
|
||||
//
|
||||
// Below is a full list of the keyspace allocation:
|
||||
//
|
||||
@@ -1128,7 +1128,6 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
//
|
||||
// Checkpoint:
|
||||
// 03 00000000 00000000 00000000 00 00000001
|
||||
|
||||
//-- Section 01: relation data and metadata
|
||||
|
||||
const DBDIR_KEY: Key = Key {
|
||||
@@ -1402,8 +1401,9 @@ fn is_slru_block_key(key: Key) -> bool {
|
||||
pub fn create_test_timeline(
|
||||
tenant: &crate::tenant::Tenant,
|
||||
timeline_id: utils::id::TimelineId,
|
||||
pg_version: u32,
|
||||
) -> Result<std::sync::Arc<Timeline>> {
|
||||
let tline = tenant.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let tline = tenant.create_empty_timeline(timeline_id, Lsn(8), pg_version)?;
|
||||
let mut m = tline.begin_modification(Lsn(8));
|
||||
m.init_empty()?;
|
||||
m.commit()?;
|
||||
|
||||
@@ -2,8 +2,8 @@ use serde::{Deserialize, Serialize};
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::v14::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::pg_constants::GLOBALTABLESPACE_OID;
|
||||
use postgres_ffi::relfile_utils::forknumber_to_name;
|
||||
use postgres_ffi::Oid;
|
||||
|
||||
///
|
||||
@@ -78,7 +78,7 @@ impl fmt::Display for RelTag {
|
||||
|
||||
impl RelTag {
|
||||
pub fn to_segfile_name(&self, segno: u32) -> String {
|
||||
let mut name = if self.spcnode == pg_constants::GLOBALTABLESPACE_OID {
|
||||
let mut name = if self.spcnode == GLOBALTABLESPACE_OID {
|
||||
"global/".to_string()
|
||||
} else {
|
||||
format!("base/{}/", self.dbnode)
|
||||
|
||||
@@ -24,6 +24,19 @@ pub struct Key {
|
||||
pub const KEY_SIZE: usize = 18;
|
||||
|
||||
impl Key {
/// 'field2' stores the tablespace OID for relations and small enum numbers for other relish kinds.
/// As long as Neon does not support tablespaces (it has no access to the local file system),
/// we can assume that only a few predefined namespace OIDs are used, which fit in a u16.
pub fn to_i128(&self) -> i128 {
assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
(((self.field1 & 0xf) as i128) << 120)
| (((self.field2 & 0xFFFF) as i128) << 104)
| ((self.field3 as i128) << 72)
| ((self.field4 as i128) << 40)
| ((self.field5 as i128) << 32)
| self.field6 as i128
}
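// Worked example of the packing above (values illustrative): for an ordinary
// relation key with field1=0x00, field2=1663 (pg_default tablespace),
// field3=<db oid>, field4=<rel oid>, field5=<fork>, field6=<block>, the fields
// land in the i128 most-significant-first, in the same order they appear in the
// Key struct, so nearby blocks of the same relation map to nearby i128 values.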
|
||||
|
||||
pub fn next(&self) -> Key {
|
||||
self.add(1)
|
||||
}
|
||||
@@ -176,7 +189,7 @@ impl Value {
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default)]
|
||||
#[derive(Default, Serialize)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
@@ -185,9 +198,18 @@ pub struct GcResult {
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||
pub elapsed: Duration,
|
||||
}
|
||||
|
||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
d.as_millis().serialize(serializer)
}
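// Sketch of the effect (for illustration only): with this helper referenced from the
// #[serde(serialize_with = ...)] attribute above, a GcResult whose `elapsed` is
// Duration::from_millis(1500) serializes that field as the plain integer 1500 rather
// than as serde's default {"secs": 1, "nanos": 500000000} representation.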
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.layers_total += other.layers_total;
|
||||
|
||||
@@ -169,13 +169,8 @@ use self::{
|
||||
upload::{upload_index_part, upload_timeline_layers, UploadedTimeline},
|
||||
};
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
exponential_backoff,
|
||||
storage_sync::index::RemoteIndex,
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
task_mgr::BACKGROUND_RUNTIME,
|
||||
tenant::metadata::{metadata_path, TimelineMetadata},
|
||||
config::PageServerConf, exponential_backoff, storage_sync::index::RemoteIndex, task_mgr,
|
||||
task_mgr::TaskKind, task_mgr::BACKGROUND_RUNTIME, tenant::metadata::TimelineMetadata,
|
||||
tenant_mgr::attach_local_tenants,
|
||||
};
|
||||
use crate::{
|
||||
@@ -183,6 +178,7 @@ use crate::{
|
||||
TenantTimelineValues,
|
||||
};
|
||||
|
||||
use crate::metrics::{IMAGE_SYNC_COUNT, IMAGE_SYNC_TIME_HISTOGRAM};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use self::download::download_index_parts;
|
||||
@@ -601,6 +597,7 @@ pub fn spawn_storage_sync_task(
|
||||
|
||||
for (tenant_id, timeline_data) in local_timeline_files.0 {
|
||||
if timeline_data.is_empty() {
|
||||
info!("got empty tenant {}", tenant_id);
|
||||
let _ = empty_tenants.0.entry(tenant_id).or_default();
|
||||
} else {
|
||||
for (timeline_id, timeline_data) in timeline_data {
|
||||
@@ -642,6 +639,7 @@ pub fn spawn_storage_sync_task(
|
||||
(storage, remote_index_clone, sync_queue),
|
||||
max_sync_errors,
|
||||
)
|
||||
.instrument(info_span!("storage_sync_loop"))
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
@@ -839,7 +837,6 @@ async fn process_sync_task_batch(
|
||||
sync_id,
|
||||
upload_data,
|
||||
sync_start,
|
||||
"upload",
|
||||
)
|
||||
.await
|
||||
}
|
||||
@@ -883,7 +880,6 @@ async fn process_sync_task_batch(
|
||||
sync_id,
|
||||
download_data,
|
||||
sync_start,
|
||||
"download",
|
||||
)
|
||||
.await;
|
||||
}
|
||||
@@ -915,7 +911,6 @@ async fn process_sync_task_batch(
|
||||
sync_id,
|
||||
delete_data,
|
||||
sync_start,
|
||||
"delete",
|
||||
)
|
||||
.instrument(info_span!("delete_timeline_data"))
|
||||
.await;
|
||||
@@ -952,8 +947,9 @@ async fn download_timeline_data(
|
||||
sync_id: TenantTimelineId,
|
||||
new_download_data: SyncData<LayersDownload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> DownloadStatus {
|
||||
static TASK_NAME: &str = "download";
|
||||
|
||||
match download_timeline_layers(
|
||||
conf,
|
||||
storage,
|
||||
@@ -965,19 +961,19 @@ async fn download_timeline_data(
|
||||
.await
|
||||
{
|
||||
DownloadedTimeline::Abort => {
|
||||
register_sync_status(sync_id, sync_start, task_name, None);
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, None);
|
||||
if let Err(e) = index.write().await.set_awaits_download(&sync_id, false) {
|
||||
error!("Timeline {sync_id} was expected to be in the remote index after a download attempt, but it's absent: {e:?}");
|
||||
}
|
||||
}
|
||||
DownloadedTimeline::FailedAndRescheduled => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
}
|
||||
DownloadedTimeline::Successful(mut download_data) => {
|
||||
match update_local_metadata(conf, sync_id, current_remote_timeline).await {
|
||||
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
|
||||
Ok(()) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
|
||||
return DownloadStatus::Downloaded;
|
||||
}
|
||||
Err(e) => {
|
||||
@@ -988,7 +984,7 @@ async fn download_timeline_data(
|
||||
error!("Failed to update local timeline metadata: {e:?}");
|
||||
download_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Download(download_data));
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1011,7 +1007,7 @@ async fn update_local_metadata(
|
||||
};
|
||||
let remote_lsn = remote_metadata.disk_consistent_lsn();
|
||||
|
||||
let local_metadata_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id);
|
||||
let local_metadata_path = conf.metadata_path(sync_id.timeline_id, sync_id.tenant_id);
|
||||
let local_lsn = if local_metadata_path.exists() {
|
||||
let local_metadata = read_metadata_file(&local_metadata_path)
|
||||
.await
|
||||
@@ -1064,8 +1060,9 @@ async fn delete_timeline_data(
|
||||
sync_id: TenantTimelineId,
|
||||
mut new_delete_data: SyncData<LayersDeletion>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) {
|
||||
static TASK_NAME: &str = "delete";
|
||||
|
||||
let timeline_delete = &mut new_delete_data.data;
|
||||
|
||||
if !timeline_delete.deletion_registered {
|
||||
@@ -1081,14 +1078,14 @@ async fn delete_timeline_data(
|
||||
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
||||
new_delete_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Delete(new_delete_data));
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
return;
|
||||
}
|
||||
}
|
||||
timeline_delete.deletion_registered = true;
|
||||
|
||||
let sync_status = delete_timeline_layers(storage, sync_queue, sync_id, new_delete_data).await;
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(sync_status));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(sync_status));
|
||||
}
|
||||
|
||||
async fn read_metadata_file(metadata_path: &Path) -> anyhow::Result<TimelineMetadata> {
|
||||
@@ -1107,8 +1104,8 @@ async fn upload_timeline_data(
|
||||
sync_id: TenantTimelineId,
|
||||
new_upload_data: SyncData<LayersUpload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> UploadStatus {
|
||||
static TASK_NAME: &str = "upload";
|
||||
let mut uploaded_data = match upload_timeline_layers(
|
||||
storage,
|
||||
sync_queue,
|
||||
@@ -1119,7 +1116,7 @@ async fn upload_timeline_data(
|
||||
.await
|
||||
{
|
||||
UploadedTimeline::FailedAndRescheduled(e) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
return UploadStatus::Failed(e);
|
||||
}
|
||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||
@@ -1138,14 +1135,14 @@ async fn upload_timeline_data(
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(true));
|
||||
UploadStatus::Uploaded
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to update remote timeline {sync_id}: {e:?}");
|
||||
uploaded_data.retries += 1;
|
||||
sync_queue.push(sync_id, SyncTask::Upload(uploaded_data));
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(false));
|
||||
register_sync_status(sync_id, sync_start, TASK_NAME, Some(false));
|
||||
UploadStatus::Failed(e)
|
||||
}
|
||||
}
|
||||
@@ -1303,6 +1300,10 @@ fn schedule_first_sync_tasks(
|
||||
None => {
|
||||
// TODO (rodionov) does this mean that we've crashed during tenant creation?
|
||||
// is it safe to upload this checkpoint? could it be half broken?
|
||||
warn!(
|
||||
"marking {} as locally complete, while it doesnt exist in remote index",
|
||||
sync_id
|
||||
);
|
||||
new_sync_tasks.push_back((
|
||||
sync_id,
|
||||
SyncTask::upload(LayersUpload {
|
||||
@@ -1337,6 +1338,8 @@ fn compare_local_and_remote_timeline(
|
||||
local_files: HashSet<PathBuf>,
|
||||
remote_entry: &RemoteTimeline,
|
||||
) -> (LocalTimelineInitStatus, bool) {
|
||||
let _entered = info_span!("compare_local_and_remote_timeline", sync_id = %sync_id).entered();
|
||||
|
||||
let remote_files = remote_entry.stored_files();
|
||||
|
||||
let number_of_layers_to_download = remote_files.difference(&local_files).count();
|
||||
@@ -1347,10 +1350,12 @@ fn compare_local_and_remote_timeline(
|
||||
layers_to_skip: local_files.clone(),
|
||||
}),
|
||||
));
|
||||
info!("NeedsSync");
|
||||
(LocalTimelineInitStatus::NeedsSync, true)
|
||||
// we do not need to update the remote consistent lsn here
// because it will be updated when the sync completes
|
||||
} else {
|
||||
info!("LocallyComplete");
|
||||
(
|
||||
LocalTimelineInitStatus::LocallyComplete(local_metadata.clone()),
|
||||
false,
|
||||
@@ -1387,16 +1392,22 @@ fn register_sync_status(
|
||||
|
||||
let tenant_id = sync_id.tenant_id.to_string();
|
||||
let timeline_id = sync_id.timeline_id.to_string();
|
||||
match sync_status {
|
||||
Some(true) => {
|
||||
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "success"])
|
||||
}
|
||||
Some(false) => {
|
||||
IMAGE_SYNC_TIME.with_label_values(&[&tenant_id, &timeline_id, sync_name, "failure"])
|
||||
}
|
||||
None => return,
|
||||
}
|
||||
.observe(secs_elapsed)
|
||||
|
||||
let sync_status = match sync_status {
Some(true) => "success",
Some(false) => "failure",
None => "abort",
};

IMAGE_SYNC_TIME_HISTOGRAM
.with_label_values(&[sync_name, sync_status])
.observe(secs_elapsed);
IMAGE_SYNC_TIME
.with_label_values(&[&tenant_id, &timeline_id])
.add(secs_elapsed);
IMAGE_SYNC_COUNT
.with_label_values(&[&tenant_id, &timeline_id, sync_name, sync_status])
.inc();
}
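// For reference (comment added for clarity, not part of the patch), the three series
// updated above correspond to the metric definitions earlier in this change:
//   pageserver_remote_storage_image_sync_seconds   (histogram, per operation_kind/status)
//   pageserver_remote_storage_image_sync_duration  (gauge, per tenant/timeline)
//   pageserver_remote_storage_image_sync_count     (counter, per tenant/timeline/operation_kind/status)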
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -1424,7 +1435,7 @@ mod test_utils {
|
||||
}
|
||||
|
||||
fs::write(
|
||||
metadata_path(harness.conf, timeline_id, harness.tenant_id),
|
||||
harness.conf.metadata_path(timeline_id, harness.tenant_id),
|
||||
metadata.to_bytes()?,
|
||||
)
|
||||
.await?;
|
||||
@@ -1441,7 +1452,17 @@ mod test_utils {
|
||||
}
|
||||
|
||||
pub(super) fn dummy_metadata(disk_consistent_lsn: Lsn) -> TimelineMetadata {
|
||||
TimelineMetadata::new(disk_consistent_lsn, None, None, Lsn(0), Lsn(0), Lsn(0))
|
||||
TimelineMetadata::new(
|
||||
disk_consistent_lsn,
|
||||
None,
|
||||
None,
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Any version will do
|
||||
// but it should be consistent with the one in the tests
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -9,18 +9,18 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use remote_storage::{path_with_suffix_extension, DownloadError, GenericRemoteStorage};
|
||||
use remote_storage::{DownloadError, GenericRemoteStorage};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncWriteExt},
|
||||
};
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path,
|
||||
TEMP_FILE_SUFFIX,
|
||||
use crate::{config::PageServerConf, storage_sync::SyncTask, TEMP_FILE_SUFFIX};
|
||||
use utils::{
|
||||
crashsafe_dir::path_with_suffix_extension,
|
||||
id::{TenantId, TenantTimelineId, TimelineId},
|
||||
};
|
||||
use utils::id::{TenantId, TenantTimelineId, TimelineId};
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
@@ -137,7 +137,8 @@ async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
sync_id: TenantTimelineId,
|
||||
) -> Result<IndexPart, DownloadError> {
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
let index_part_path = conf
|
||||
.metadata_path(sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let mut index_part_download = storage
|
||||
.download_storage_object(None, &index_part_path)
|
||||
@@ -620,9 +621,10 @@ mod tests {
|
||||
metadata.to_bytes()?,
|
||||
);
|
||||
|
||||
let local_index_part_path =
|
||||
metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let local_index_part_path = harness
|
||||
.conf
|
||||
.metadata_path(sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
let index_part_remote_id = local_storage.remote_object_id(&local_index_part_path)?;
|
||||
let index_part_local_path = PathBuf::from(index_part_remote_id.to_string());
|
||||
fs::create_dir_all(index_part_local_path.parent().unwrap()).await?;
|
||||
|
||||
@@ -341,13 +341,21 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::tenant::harness::{TenantHarness, TIMELINE_ID};
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion() {
|
||||
let harness = TenantHarness::create("index_part_conversion").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata =
|
||||
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
|
||||
let metadata = TimelineMetadata::new(
|
||||
Lsn(5).align(),
|
||||
Some(Lsn(4)),
|
||||
None,
|
||||
Lsn(3),
|
||||
Lsn(2),
|
||||
Lsn(1),
|
||||
DEFAULT_PG_VERSION,
|
||||
);
|
||||
let remote_timeline = RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
timeline_path.join("layer_1"),
|
||||
@@ -464,8 +472,15 @@ mod tests {
|
||||
fn index_part_conversion_negatives() {
|
||||
let harness = TenantHarness::create("index_part_conversion_negatives").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata =
|
||||
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
|
||||
let metadata = TimelineMetadata::new(
|
||||
Lsn(5).align(),
|
||||
Some(Lsn(4)),
|
||||
None,
|
||||
Lsn(3),
|
||||
Lsn(2),
|
||||
Lsn(1),
|
||||
DEFAULT_PG_VERSION,
|
||||
);
|
||||
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
|
||||
@@ -15,7 +15,7 @@ use super::{
|
||||
LayersUpload, SyncData, SyncQueue,
|
||||
};
|
||||
use crate::metrics::NO_LAYERS_UPLOAD;
|
||||
use crate::{config::PageServerConf, storage_sync::SyncTask, tenant::metadata::metadata_path};
|
||||
use crate::{config::PageServerConf, storage_sync::SyncTask};
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part(
|
||||
@@ -29,7 +29,8 @@ pub(super) async fn upload_index_part(
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
let index_part_path = conf
|
||||
.metadata_path(sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME);
|
||||
storage
|
||||
.upload_storage_object(
|
||||
|
||||
@@ -14,9 +14,9 @@
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use tokio::sync::watch;
|
||||
use tracing::*;
|
||||
use utils::crashsafe_dir::path_with_suffix_extension;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::collections::hash_map;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashMap;
|
||||
@@ -27,6 +27,8 @@ use std::io::Write;
|
||||
use std::num::NonZeroU64;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
use std::process::Stdio;
|
||||
use std::sync::Arc;
|
||||
use std::sync::MutexGuard;
|
||||
use std::sync::{Mutex, RwLock};
|
||||
@@ -34,16 +36,16 @@ use std::time::{Duration, Instant};
|
||||
|
||||
use self::metadata::TimelineMetadata;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::metrics::remove_tenant_metrics;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
use crate::metrics::STORAGE_TIME;
|
||||
use crate::import_datadir;
|
||||
use crate::metrics::{remove_tenant_metrics, STORAGE_TIME};
|
||||
use crate::repository::GcResult;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::task_mgr;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{CheckpointConfig, TEMP_FILE_SUFFIX};
|
||||
pub use pageserver_api::models::TenantState;
|
||||
|
||||
use toml_edit;
|
||||
use utils::{
|
||||
@@ -117,22 +119,14 @@ pub struct Tenant {
|
||||
upload_layers: bool,
|
||||
}
|
||||
|
||||
/// A state of a tenant in pageserver's memory.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
pub enum TenantState {
|
||||
/// Tenant is fully operational, its background jobs might be running or not.
|
||||
Active { background_jobs_running: bool },
|
||||
/// A tenant is recognized by pageserver, but not yet ready to operate:
|
||||
/// e.g. not present locally and being downloaded or being read into memory from the file system.
|
||||
Paused,
|
||||
/// A tenant is recognized by the pageserver, but no longer used for any operations, as failed to get activated.
|
||||
Broken,
|
||||
}
|
||||
|
||||
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
impl Tenant {
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
pub fn tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
}
|
||||
|
||||
/// Get Timeline handle for given Neon timeline ID.
|
||||
/// This function is idempotent. It doesn't change internal state in any way.
|
||||
pub fn get_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<Arc<Timeline>> {
|
||||
self.timelines
|
||||
@@ -142,8 +136,7 @@ impl Tenant {
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timeline {} was not found for tenant {}",
|
||||
timeline_id,
|
||||
self.tenant_id()
|
||||
timeline_id, self.tenant_id
|
||||
)
|
||||
})
|
||||
.map(Arc::clone)
|
||||
@@ -166,6 +159,7 @@ impl Tenant {
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// XXX: keep the lock to avoid races during timeline creation
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
@@ -180,122 +174,84 @@ impl Tenant {
|
||||
bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.")
|
||||
}
|
||||
|
||||
// Create the timeline directory, and write initial metadata to file.
|
||||
crashsafe_dir::create_dir_all(timeline_path)?;
|
||||
|
||||
let new_metadata =
|
||||
TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
|
||||
save_metadata(
|
||||
self.conf,
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
&new_metadata,
|
||||
true,
|
||||
)?;
|
||||
|
||||
let new_metadata = TimelineMetadata::new(
|
||||
Lsn(0),
|
||||
None,
|
||||
None,
|
||||
Lsn(0),
|
||||
initdb_lsn,
|
||||
initdb_lsn,
|
||||
pg_version,
|
||||
);
|
||||
let new_timeline =
|
||||
self.initialize_new_timeline(new_timeline_id, new_metadata, &mut timelines)?;
|
||||
self.create_initialized_timeline(new_timeline_id, new_metadata, &mut timelines)?;
|
||||
new_timeline.layers.write().unwrap().next_open_layer_at = Some(initdb_lsn);
|
||||
|
||||
if let hash_map::Entry::Vacant(v) = timelines.entry(new_timeline_id) {
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
}
|
||||
|
||||
Ok(new_timeline)
|
||||
}
|
||||
|
||||
/// Branch a timeline
|
||||
pub fn branch_timeline(
|
||||
/// Create a new timeline.
|
||||
///
|
||||
/// Returns the new timeline ID and reference to its Timeline object.
|
||||
///
|
||||
/// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
|
||||
/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given,
|
||||
/// a new unique ID is generated.
|
||||
pub async fn create_timeline(
|
||||
&self,
|
||||
src: TimelineId,
|
||||
dst: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
|
||||
// about timelines, so otherwise a race condition is possible, where we create new timeline and GC
|
||||
// concurrently removes data that is needed by the new timeline.
|
||||
let _gc_cs = self.gc_cs.lock().unwrap();
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
pg_version: u32,
|
||||
) -> Result<Option<Arc<Timeline>>> {
|
||||
let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate);
|
||||
|
||||
// In order for the branch creation task to not wait for GC/compaction,
|
||||
// we need to make sure that the starting LSN of the child branch is not out of scope midway by
|
||||
//
|
||||
// 1. holding the GC lock to prevent overwritting timeline's GC data
|
||||
// 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline
|
||||
//
|
||||
// Step 2 is to avoid initializing the new branch using data removed by past GC iterations
|
||||
// or in-queue GC iterations.
|
||||
|
||||
// XXX: keep the lock to avoid races during timeline creation
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let src_timeline = timelines
|
||||
.get(&src)
|
||||
// message about timeline being remote is one .context up in the stack
|
||||
.ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?;
|
||||
|
||||
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
|
||||
|
||||
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
|
||||
let start_lsn = start_lsn.unwrap_or_else(|| {
|
||||
let lsn = src_timeline.get_last_record_lsn();
|
||||
info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
|
||||
lsn
|
||||
});
|
||||
|
||||
// Check if the starting LSN is out of scope because it is less than
|
||||
// 1. the latest GC cutoff LSN or
|
||||
// 2. the planned GC cutoff LSN, which is from an in-queue GC iteration.
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
))?;
|
||||
if self
|
||||
.conf
|
||||
.timeline_path(&new_timeline_id, &self.tenant_id)
|
||||
.exists()
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
|
||||
if start_lsn < cutoff {
|
||||
bail!(format!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
|
||||
));
|
||||
}
|
||||
debug!("timeline {new_timeline_id} already exists");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// Determine prev-LSN for the new timeline. We can only determine it if
|
||||
// the timeline was branched at the current end of the source timeline.
|
||||
let RecordLsn {
|
||||
last: src_last,
|
||||
prev: src_prev,
|
||||
} = src_timeline.get_last_record_rlsn();
|
||||
let dst_prev = if src_last == start_lsn {
|
||||
Some(src_prev)
|
||||
} else {
|
||||
None
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = self
|
||||
.get_timeline(ancestor_timeline_id)
|
||||
.context("Cannot branch off the timeline that's not present in pageserver")?;
|
||||
|
||||
if let Some(lsn) = ancestor_start_lsn.as_mut() {
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
// to the requested branch point. The repository code itself doesn't
|
||||
// require it, but if we start to receive WAL on the new timeline,
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
*lsn = lsn.align();
|
||||
ancestor_timeline.wait_lsn(*lsn).await?;
|
||||
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > *lsn {
|
||||
// can we safely just branch from the ancestor instead?
|
||||
bail!(
|
||||
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
|
||||
lsn,
|
||||
ancestor_timeline_id,
|
||||
ancestor_ancestor_lsn,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
|
||||
}
|
||||
None => self.bootstrap_timeline(new_timeline_id, pg_version)?,
|
||||
};
|
||||
|
||||
// create a new timeline directory
|
||||
let timelinedir = self.conf.timeline_path(&dst, &self.tenant_id);
|
||||
crashsafe_dir::create_dir(&timelinedir)?;
|
||||
// Have added new timeline into the tenant, now its background tasks are needed.
|
||||
self.activate(true);
|
||||
|
||||
// Create the metadata file, noting the ancestor of the new timeline.
|
||||
// There is initially no data in it, but all the read-calls know to look
|
||||
// into the ancestor.
|
||||
let metadata = TimelineMetadata::new(
|
||||
start_lsn,
|
||||
dst_prev,
|
||||
Some(src),
|
||||
start_lsn,
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(),
|
||||
src_timeline.initdb_lsn,
|
||||
);
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenant_id))?;
|
||||
save_metadata(self.conf, dst, self.tenant_id, &metadata, true)?;
|
||||
|
||||
let new_timeline = self.initialize_new_timeline(dst, metadata, &mut timelines)?;
|
||||
timelines.insert(dst, Arc::clone(&new_timeline));
|
||||
|
||||
info!("branched timeline {dst} from {src} at {start_lsn}");
|
||||
|
||||
Ok(new_timeline)
|
||||
Ok(Some(loaded_timeline))
|
||||
}
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
@@ -342,8 +298,7 @@ impl Tenant {
|
||||
drop(timelines);
|
||||
|
||||
for (timeline_id, timeline) in &timelines_to_compact {
|
||||
let _entered =
|
||||
info_span!("compact", timeline = %timeline_id, tenant = %self.tenant_id).entered();
|
||||
let _entered = info_span!("compact_timeline", timeline = %timeline_id).entered();
|
||||
timeline.compact()?;
|
||||
}
|
||||
|
||||
@@ -429,16 +384,24 @@ impl Tenant {
|
||||
|
||||
let mut timelines_accessor = self.timelines.lock().unwrap();
|
||||
for (timeline_id, metadata) in sorted_timelines {
|
||||
let timeline = self
|
||||
.initialize_new_timeline(timeline_id, metadata, &mut timelines_accessor)
|
||||
.with_context(|| format!("Failed to initialize timeline {timeline_id}"))?;
|
||||
|
||||
match timelines_accessor.entry(timeline.timeline_id) {
|
||||
hash_map::Entry::Occupied(_) => anyhow::bail!(
|
||||
"Found freshly initialized timeline {} in the tenant map",
|
||||
timeline.timeline_id
|
||||
info!(
|
||||
"Attaching timeline {} pg_version {}",
|
||||
timeline_id,
|
||||
metadata.pg_version()
|
||||
);
|
||||
let ancestor = metadata
|
||||
.ancestor_timeline()
|
||||
.and_then(|ancestor_timeline_id| timelines_accessor.get(&ancestor_timeline_id))
|
||||
.cloned();
|
||||
match timelines_accessor.entry(timeline_id) {
|
||||
Entry::Occupied(_) => warn!(
|
||||
"Timeline {}/{} already exists in the tenant map, skipping its initialization",
|
||||
self.tenant_id, timeline_id
|
||||
),
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
Entry::Vacant(v) => {
|
||||
let timeline = self
|
||||
.initialize_new_timeline(timeline_id, metadata, ancestor)
|
||||
.with_context(|| format!("Failed to initialize timeline {timeline_id}"))?;
|
||||
v.insert(timeline);
|
||||
}
|
||||
}
|
||||
@@ -646,24 +609,17 @@ impl Tenant {
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: TimelineMetadata,
|
||||
timelines: &mut MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
ancestor: Option<Arc<Timeline>>,
|
||||
) -> anyhow::Result<Arc<Timeline>> {
|
||||
let ancestor = match new_metadata.ancestor_timeline() {
|
||||
Some(ancestor_timeline_id) => Some(
|
||||
timelines
|
||||
.get(&ancestor_timeline_id)
|
||||
.cloned()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
|
||||
)
|
||||
})?,
|
||||
),
|
||||
None => None,
|
||||
};
|
||||
if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
|
||||
anyhow::ensure!(
|
||||
ancestor.is_some(),
|
||||
"Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found"
|
||||
)
|
||||
}
|
||||
|
||||
let new_disk_consistent_lsn = new_metadata.disk_consistent_lsn();
|
||||
|
||||
let pg_version = new_metadata.pg_version();
|
||||
let new_timeline = Arc::new(Timeline::new(
|
||||
self.conf,
|
||||
Arc::clone(&self.tenant_conf),
|
||||
@@ -673,6 +629,7 @@ impl Tenant {
|
||||
self.tenant_id,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
self.upload_layers,
|
||||
pg_version,
|
||||
));
|
||||
|
||||
new_timeline
|
||||
@@ -711,7 +668,7 @@ impl Tenant {
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<TenantConfOpt> {
|
||||
let target_config_path = TenantConf::path(conf, tenant_id);
|
||||
let target_config_path = conf.tenant_config_path(tenant_id);
|
||||
let target_config_display = target_config_path.display();
|
||||
|
||||
info!("loading tenantconf from {target_config_display}");
|
||||
@@ -803,7 +760,7 @@ impl Tenant {
|
||||
})
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to fsync on firts save for config {}",
|
||||
"Failed to fsync on first save for config {}",
|
||||
target_config_path.display()
|
||||
)
|
||||
})?;
|
||||
@@ -843,9 +800,6 @@ impl Tenant {
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
let _span_guard =
|
||||
info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id)
|
||||
.entered();
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
|
||||
@@ -960,9 +914,220 @@ impl Tenant {
|
||||
Ok(totals)
|
||||
}
|
||||
|
||||
pub fn tenant_id(&self) -> TenantId {
|
||||
self.tenant_id
|
||||
fn branch_timeline(
|
||||
&self,
|
||||
src: TimelineId,
|
||||
dst: TimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
|
||||
// about timelines, so otherwise a race condition is possible, where we create new timeline and GC
|
||||
// concurrently removes data that is needed by the new timeline.
|
||||
let _gc_cs = self.gc_cs.lock().unwrap();

// In order for the branch creation task to not wait for GC/compaction,
// we need to make sure that the starting LSN of the child branch does not fall out of scope midway, by
//
// 1. holding the GC lock to prevent overwriting the timeline's GC data
// 2. checking both the latest GC cutoff LSN and the latest GC info of the source timeline
//
// Step 2 avoids initializing the new branch with data removed by past or in-queue GC iterations.

// XXX: keep the lock to avoid races during timeline creation
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let src_timeline = timelines
|
||||
.get(&src)
|
||||
// message about timeline being remote is one .context up in the stack
|
||||
.ok_or_else(|| anyhow::anyhow!("unknown timeline id: {src}"))?;
|
||||
|
||||
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
|
||||
|
||||
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
|
||||
let start_lsn = start_lsn.unwrap_or_else(|| {
|
||||
let lsn = src_timeline.get_last_record_lsn();
|
||||
info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
|
||||
lsn
|
||||
});
|
||||
|
||||
// Check if the starting LSN is out of scope because it is less than
|
||||
// 1. the latest GC cutoff LSN or
|
||||
// 2. the planned GC cutoff LSN, which is from an in-queue GC iteration.
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.context(format!(
|
||||
"invalid branch start lsn: less than latest GC cutoff {}",
|
||||
*latest_gc_cutoff_lsn,
|
||||
))?;
|
||||
{
|
||||
let gc_info = src_timeline.gc_info.read().unwrap();
|
||||
let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff);
|
||||
if start_lsn < cutoff {
|
||||
bail!(format!(
|
||||
"invalid branch start lsn: less than planned GC cutoff {cutoff}"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
// Determine prev-LSN for the new timeline. We can only determine it if
|
||||
// the timeline was branched at the current end of the source timeline.
|
||||
let RecordLsn {
|
||||
last: src_last,
|
||||
prev: src_prev,
|
||||
} = src_timeline.get_last_record_rlsn();
|
||||
let dst_prev = if src_last == start_lsn {
|
||||
Some(src_prev)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
// Create the metadata file, noting the ancestor of the new timeline.
|
||||
// There is initially no data in it, but all the read-calls know to look
|
||||
// into the ancestor.
|
||||
let metadata = TimelineMetadata::new(
|
||||
start_lsn,
|
||||
dst_prev,
|
||||
Some(src),
|
||||
start_lsn,
|
||||
*src_timeline.latest_gc_cutoff_lsn.read(),
|
||||
src_timeline.initdb_lsn,
|
||||
src_timeline.pg_version,
|
||||
);
|
||||
let new_timeline = self.create_initialized_timeline(dst, metadata, &mut timelines)?;
|
||||
info!("branched timeline {dst} from {src} at {start_lsn}");
|
||||
|
||||
Ok(new_timeline)
|
||||
}
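// Illustrative usage sketch (hypothetical harness name "branch_example"; it mirrors
// the harness-based tests further down in this file): branching at an explicit LSN
// succeeds as long as that LSN is still within the source timeline's GC horizon,
// per the cutoff checks above.
//
//     let tenant = TenantHarness::create("branch_example")?.load();
//     let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
//     make_some_layers(tline.as_ref(), Lsn(0x20))?;
//     let _child = tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;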
|
||||
|
||||
/// - run initdb to init temporary instance and get bootstrap data
|
||||
/// - after initialization completes, remove the temp dir.
|
||||
fn bootstrap_timeline(
|
||||
&self,
|
||||
timeline_id: TimelineId,
|
||||
pg_version: u32,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
// temporary directory for basebackup files for the given timeline.
|
||||
let initdb_path = path_with_suffix_extension(
|
||||
self.conf
|
||||
.timelines_path(&self.tenant_id)
|
||||
.join(format!("basebackup-{timeline_id}")),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
// Init a temporary repo to get bootstrap data
|
||||
run_initdb(self.conf, &initdb_path, pg_version)?;
|
||||
let pgdata_path = initdb_path;
|
||||
|
||||
let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront, we avoid an Option or a dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = self.create_empty_timeline(timeline_id, lsn, pg_version)?;
|
||||
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
info!(
|
||||
"created root timeline {} timeline.lsn {}",
|
||||
timeline_id,
|
||||
timeline.get_last_record_lsn()
|
||||
);
|
||||
|
||||
// Remove temp dir. We don't need it anymore
|
||||
fs::remove_dir_all(pgdata_path)?;
|
||||
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
fn create_initialized_timeline(
|
||||
&self,
|
||||
new_timeline_id: TimelineId,
|
||||
new_metadata: TimelineMetadata,
|
||||
timelines: &mut MutexGuard<HashMap<TimelineId, Arc<Timeline>>>,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&new_timeline_id, &self.tenant_id))
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create timeline {}/{} directory",
|
||||
new_timeline_id, self.tenant_id
|
||||
)
|
||||
})?;
|
||||
save_metadata(
|
||||
self.conf,
|
||||
new_timeline_id,
|
||||
self.tenant_id,
|
||||
&new_metadata,
|
||||
true,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create timeline {}/{} metadata",
|
||||
new_timeline_id, self.tenant_id
|
||||
)
|
||||
})?;
|
||||
|
||||
let ancestor = new_metadata
|
||||
.ancestor_timeline()
|
||||
.and_then(|ancestor_timeline_id| timelines.get(&ancestor_timeline_id))
|
||||
.cloned();
|
||||
let new_timeline = self
|
||||
.initialize_new_timeline(new_timeline_id, new_metadata, ancestor)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to initialize timeline {}/{}",
|
||||
new_timeline_id, self.tenant_id
|
||||
)
|
||||
})?;
|
||||
|
||||
match timelines.entry(new_timeline_id) {
|
||||
Entry::Occupied(_) => bail!(
|
||||
"Found freshly initialized timeline {} in the tenant map",
|
||||
new_timeline_id
|
||||
),
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(Arc::clone(&new_timeline));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(new_timeline)
|
||||
}
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
/// to get bootstrap data for timeline initialization.
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path, pg_version: u32) -> Result<()> {
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
|
||||
let initdb_path = conf.pg_bin_dir(pg_version).join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", &initdbpath.to_string_lossy()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.args(&["-E", "utf8"])
|
||||
.arg("--no-instructions")
|
||||
// This is only used for a temporary installation that is deleted shortly after,
|
||||
// so no need to fsync it
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.context("failed to execute initdb")?;
|
||||
if !initdb_output.status.success() {
|
||||
bail!(
|
||||
"initdb failed: '{}'",
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl Drop for Tenant {
|
||||
@@ -1010,7 +1175,6 @@ pub mod harness {
|
||||
walredo::{WalRedoError, WalRedoManager},
|
||||
};
|
||||
|
||||
use super::metadata::metadata_path;
|
||||
use super::*;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use hex_literal::hex;
|
||||
@@ -1146,7 +1310,7 @@ pub mod harness {
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let metadata_path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let metadata_path = conf.metadata_path(timeline_id, tenant_id);
|
||||
let metadata_bytes = std::fs::read(&metadata_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata bytes from path {}",
|
||||
@@ -1171,6 +1335,7 @@ pub mod harness {
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} to get to {}, with {} and {} records",
|
||||
@@ -1192,11 +1357,12 @@ pub mod harness {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
use super::*;
|
||||
use crate::config::METADATA_FILE_NAME;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::BytesMut;
|
||||
use hex_literal::hex;
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -1208,7 +1374,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_basic() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_basic")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
@@ -1230,9 +1396,9 @@ mod tests {
|
||||
#[test]
|
||||
fn no_duplicate_timelines() -> Result<()> {
|
||||
let tenant = TenantHarness::create("no_duplicate_timelines")?.load();
|
||||
let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let _ = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
|
||||
match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) {
|
||||
Ok(_) => panic!("duplicate timeline creation should fail"),
|
||||
Err(e) => assert_eq!(
|
||||
e.to_string(),
|
||||
@@ -1256,7 +1422,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_branch")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
let writer = tline.writer();
|
||||
use std::str::from_utf8;
|
||||
|
||||
@@ -1351,7 +1517,7 @@ mod tests {
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?
|
||||
.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
@@ -1381,7 +1547,7 @@ mod tests {
|
||||
let tenant =
|
||||
TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
|
||||
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)?;
|
||||
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
|
||||
match tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
@@ -1407,7 +1573,7 @@ mod tests {
|
||||
RepoHarness::create("test_prohibit_get_for_garbage_collected_data")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
@@ -1425,7 +1591,7 @@ mod tests {
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
@@ -1442,7 +1608,7 @@ mod tests {
|
||||
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
|
||||
let tenant =
|
||||
TenantHarness::create("test_parent_keeps_data_forever_after_branching")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
tenant.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
@@ -1470,7 +1636,8 @@ mod tests {
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
{
|
||||
let tenant = harness.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
|
||||
let tline =
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x8000))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
}
|
||||
@@ -1490,7 +1657,7 @@ mod tests {
|
||||
// create two timelines
|
||||
{
|
||||
let tenant = harness.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
@@ -1526,7 +1693,7 @@ mod tests {
|
||||
let harness = TenantHarness::create(TEST_NAME)?;
|
||||
let tenant = harness.load();
|
||||
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
drop(tenant);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
@@ -1563,7 +1730,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_images() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_images")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
@@ -1613,7 +1780,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_bulk_insert() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_bulk_insert")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
let mut lsn = Lsn(0x10);
|
||||
|
||||
@@ -1653,7 +1820,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_random_updates() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_random_updates")?.load();
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
const NUM_KEYS: usize = 1000;
|
||||
|
||||
@@ -1723,7 +1890,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_traverse_branches() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_traverse_branches")?.load();
|
||||
let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
const NUM_KEYS: usize = 1000;
|
||||
|
||||
@@ -1802,7 +1969,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_traverse_ancestors() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_traverse_ancestors")?.load();
|
||||
let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let mut tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)?;
|
||||
|
||||
const NUM_KEYS: usize = 100;
|
||||
const NUM_TLINES: usize = 50;
|
||||
|
||||
@@ -713,7 +713,7 @@ impl DeltaLayerWriter {
|
||||
for buf in block_buf.blocks {
|
||||
file.write_all(buf.as_ref())?;
|
||||
}
|
||||
|
||||
assert!(self.lsn_range.start < self.lsn_range.end);
|
||||
// Fill in the summary on blk 0
|
||||
let summary = Summary {
|
||||
magic: DELTA_FILE_MAGIC,
|
||||
|
||||
@@ -15,9 +15,15 @@ use crate::repository::Key;
|
||||
use crate::tenant::inmemory_layer::InMemoryLayer;
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::tenant::storage_layer::{range_eq, range_overlaps};
|
||||
use amplify_num::i256;
|
||||
use anyhow::Result;
|
||||
use num_traits::identities::{One, Zero};
|
||||
use num_traits::{Bounded, Num, Signed};
|
||||
use rstar::{RTree, RTreeObject, AABB};
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::ops::{Add, Div, Mul, Neg, Rem, Sub};
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
@@ -47,14 +53,163 @@ pub struct LayerMap {
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,

/// All the historic layers are kept here
historic_layers: RTree<LayerRTreeObject>,

/// TODO: This is a placeholder implementation of a data structure
/// to hold information about all the layer files on disk and in
/// S3. Currently, it's just a vector and all operations perform a
/// linear scan over it. That obviously becomes slow as the
/// number of layers grows. I'm imagining that an R-tree or some
/// other 2D data structure would be the long-term solution here.
historic_layers: Vec<Arc<dyn Layer>>,
/// L0 layers have key range Key::MIN..Key::MAX, and locating them with an R-tree search is very inefficient.
/// So L0 layers are also held in the l0_delta_layers vector, in addition to the R-tree.
l0_delta_layers: Vec<Arc<dyn Layer>>,
}
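// Sketch of the dual bookkeeping described above (it mirrors insert_historic later
// in this file): an L0 delta layer, which spans Key::MIN..Key::MAX, is pushed into
// l0_delta_layers so that get_level0_deltas() can return it without an R-tree scan,
// and is inserted into the R-tree alongside every other historic layer:
//
//     if layer.get_key_range() == (Key::MIN..Key::MAX) {
//         self.l0_delta_layers.push(layer.clone());
//     }
//     self.historic_layers.insert(LayerRTreeObject { layer });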
struct LayerRTreeObject {
|
||||
layer: Arc<dyn Layer>,
|
||||
}

// Representation of Key as a numeric type.
// We cannot use a native i128 implementation, because rstar::RTree
// doesn't properly handle integer overflow during area calculation: sum(Xi*Yi).
// Overflow causes a panic in debug mode and an incorrect area calculation in release mode,
// which leads to a non-optimally balanced R-Tree (but doesn't affect the correctness of R-Tree operations).
// By using i256 as the type, even though all the actual values would fit in i128, we can be
// sure that multiplication doesn't overflow.
//

#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)]
|
||||
struct IntKey(i256);
|
||||
|
||||
impl Copy for IntKey {}
|
||||
|
||||
impl IntKey {
|
||||
fn from(i: i128) -> Self {
|
||||
IntKey(i256::from(i))
|
||||
}
|
||||
}
|
||||
|
||||
impl Bounded for IntKey {
|
||||
fn min_value() -> Self {
|
||||
IntKey(i256::MIN)
|
||||
}
|
||||
fn max_value() -> Self {
|
||||
IntKey(i256::MAX)
|
||||
}
|
||||
}
|
||||
|
||||
impl Signed for IntKey {
|
||||
fn is_positive(&self) -> bool {
|
||||
self.0 > i256::ZERO
|
||||
}
|
||||
fn is_negative(&self) -> bool {
|
||||
self.0 < i256::ZERO
|
||||
}
|
||||
fn signum(&self) -> Self {
|
||||
match self.0.cmp(&i256::ZERO) {
|
||||
Ordering::Greater => IntKey(i256::ONE),
|
||||
Ordering::Less => IntKey(-i256::ONE),
|
||||
Ordering::Equal => IntKey(i256::ZERO),
|
||||
}
|
||||
}
|
||||
fn abs(&self) -> Self {
|
||||
IntKey(self.0.abs())
|
||||
}
|
||||
fn abs_sub(&self, other: &Self) -> Self {
|
||||
if self.0 <= other.0 {
|
||||
IntKey(i256::ZERO)
|
||||
} else {
|
||||
IntKey(self.0 - other.0)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Neg for IntKey {
|
||||
type Output = Self;
|
||||
fn neg(self) -> Self::Output {
|
||||
IntKey(-self.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Rem for IntKey {
|
||||
type Output = Self;
|
||||
fn rem(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 % rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Div for IntKey {
|
||||
type Output = Self;
|
||||
fn div(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 / rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Add for IntKey {
|
||||
type Output = Self;
|
||||
fn add(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 + rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Sub for IntKey {
|
||||
type Output = Self;
|
||||
fn sub(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 - rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl Mul for IntKey {
|
||||
type Output = Self;
|
||||
fn mul(self, rhs: Self) -> Self::Output {
|
||||
IntKey(self.0 * rhs.0)
|
||||
}
|
||||
}
|
||||
|
||||
impl One for IntKey {
|
||||
fn one() -> Self {
|
||||
IntKey(i256::ONE)
|
||||
}
|
||||
}
|
||||
|
||||
impl Zero for IntKey {
|
||||
fn zero() -> Self {
|
||||
IntKey(i256::ZERO)
|
||||
}
|
||||
fn is_zero(&self) -> bool {
|
||||
self.0 == i256::ZERO
|
||||
}
|
||||
}
|
||||
|
||||
impl Num for IntKey {
|
||||
type FromStrRadixErr = <i128 as Num>::FromStrRadixErr;
|
||||
fn from_str_radix(str: &str, radix: u32) -> Result<Self, Self::FromStrRadixErr> {
|
||||
Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?)))
|
||||
}
|
||||
}
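// Why i256 is sufficient (a back-of-the-envelope check, not an API guarantee):
// each key/LSN coordinate fits in an i128, i.e. its magnitude is below 2^127, so any
// pairwise product in the area computation stays below 2^254 and fits comfortably in
// a signed 256-bit integer, while the same multiplication on i128 could overflow
// (panicking in debug builds).
//
//     let a = IntKey::from(i128::MAX / 2);
//     let b = IntKey::from(i128::MAX / 2);
//     let _area = a * b; // fine with i256; would overflow with plain i128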
|
||||
|
||||
impl PartialEq for LayerRTreeObject {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
// seems to work, the assertion below would be triggered
|
||||
// otherwise but this ought to be fixed.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
Arc::ptr_eq(&self.layer, &other.layer)
|
||||
}
|
||||
}
|
||||
|
||||
impl RTreeObject for LayerRTreeObject {
|
||||
type Envelope = AABB<[IntKey; 2]>;
|
||||
fn envelope(&self) -> Self::Envelope {
|
||||
let key_range = self.layer.get_key_range();
|
||||
let lsn_range = self.layer.get_lsn_range();
|
||||
AABB::from_corners(
|
||||
[
|
||||
IntKey::from(key_range.start.to_i128()),
|
||||
IntKey::from(lsn_range.start.0 as i128),
|
||||
],
|
||||
[
|
||||
IntKey::from(key_range.end.to_i128() - 1),
|
||||
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||
], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive
|
||||
)
|
||||
}
|
||||
}
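// Point-query sketch (it mirrors the search code below): looking up `key` at LSNs
// strictly below `end_lsn` builds an envelope whose upper corners subtract 1,
// because AABB bounds are inclusive while the key/LSN end bounds are exclusive.
//
//     let envelope = AABB::from_corners(
//         [IntKey::from(key.to_i128()), IntKey::from(0i128)],
//         [
//             IntKey::from(key.to_i128()),
//             IntKey::from(end_lsn.0 as i128 - 1),
//         ],
//     );
//     for e in self.historic_layers.locate_in_envelope_intersecting(&envelope) {
//         let layer = &e.layer;
//         // filter by is_incremental(), key range, LSN range ...
//     }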
|
||||
|
||||
/// Return value of LayerMap::search
|
||||
@@ -80,19 +235,24 @@ impl LayerMap {
|
||||
// Find the latest image layer that covers the given key
|
||||
let mut latest_img: Option<Arc<dyn Layer>> = None;
|
||||
let mut latest_img_lsn: Option<Lsn> = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key.to_i128()), IntKey::from(0i128)],
|
||||
[
|
||||
IntKey::from(key.to_i128()),
|
||||
IntKey::from(end_lsn.0 as i128 - 1),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
assert!(l.get_key_range().contains(&key));
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
|
||||
if img_lsn >= end_lsn {
|
||||
// too new
|
||||
continue;
|
||||
}
|
||||
assert!(img_lsn < end_lsn);
|
||||
if Lsn(img_lsn.0 + 1) == end_lsn {
|
||||
// found exact match
|
||||
return Ok(Some(SearchResult {
|
||||
@@ -108,19 +268,24 @@ impl LayerMap {
|
||||
|
||||
// Search the delta layers
|
||||
let mut latest_delta: Option<Arc<dyn Layer>> = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
assert!(l.get_key_range().contains(&key));
|
||||
if l.get_lsn_range().start >= end_lsn {
|
||||
// too new
|
||||
continue;
|
||||
info!(
|
||||
"Candidate delta layer {}..{} is too new for lsn {}",
|
||||
l.get_lsn_range().start,
|
||||
l.get_lsn_range().end,
|
||||
end_lsn
|
||||
);
|
||||
}
|
||||
|
||||
assert!(l.get_lsn_range().start < end_lsn);
|
||||
if l.get_lsn_range().end >= end_lsn {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
@@ -170,7 +335,10 @@ impl LayerMap {
|
||||
/// Insert an on-disk layer
|
||||
///
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
self.historic_layers.push(layer);
|
||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||
self.l0_delta_layers.push(layer.clone());
|
||||
}
|
||||
self.historic_layers.insert(LayerRTreeObject { layer });
|
||||
NUM_ONDISK_LAYERS.inc();
|
||||
}
|
||||
|
||||
@@ -180,17 +348,22 @@ impl LayerMap {
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let len_before = self.historic_layers.len();
|
||||
if layer.get_key_range() == (Key::MIN..Key::MAX) {
|
||||
let len_before = self.l0_delta_layers.len();
|
||||
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
// seems to work, the assertion below would be triggered
|
||||
// otherwise but this ought to be fixed.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
self.historic_layers
|
||||
.retain(|other| !Arc::ptr_eq(other, &layer));
|
||||
|
||||
assert_eq!(self.historic_layers.len(), len_before - 1);
|
||||
// FIXME: ptr_eq might fail to return true for 'dyn'
|
||||
// references. Clippy complains about this. In practice it
|
||||
// seems to work, the assertion below would be triggered
|
||||
// otherwise but this ought to be fixed.
|
||||
#[allow(clippy::vtable_address_comparisons)]
|
||||
self.l0_delta_layers
|
||||
.retain(|other| !Arc::ptr_eq(other, &layer));
|
||||
assert_eq!(self.l0_delta_layers.len(), len_before - 1);
|
||||
}
|
||||
assert!(self
|
||||
.historic_layers
|
||||
.remove(&LayerRTreeObject { layer })
|
||||
.is_some());
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
@@ -207,15 +380,26 @@ impl LayerMap {
|
||||
|
||||
loop {
|
||||
let mut made_progress = false;
|
||||
for l in self.historic_layers.iter() {
|
||||
let envelope = AABB::from_corners(
|
||||
[
|
||||
IntKey::from(range_remain.start.to_i128()),
|
||||
IntKey::from(lsn_range.start.0 as i128),
|
||||
],
|
||||
[
|
||||
IntKey::from(range_remain.end.to_i128() - 1),
|
||||
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
if !l.is_incremental()
|
||||
&& l.get_key_range().contains(&range_remain.start)
|
||||
&& lsn_range.contains(&img_lsn)
|
||||
{
|
||||
if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) {
|
||||
made_progress = true;
|
||||
let img_key_end = l.get_key_range().end;
|
||||
|
||||
@@ -232,8 +416,8 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl Iterator<Item = &Arc<dyn Layer>> {
|
||||
self.historic_layers.iter()
|
||||
pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<dyn Layer>> {
|
||||
self.historic_layers.iter().map(|e| e.layer.clone())
|
||||
}
|
||||
|
||||
/// Find the last image layer that covers 'key', ignoring any image layers
|
||||
@@ -241,19 +425,22 @@ impl LayerMap {
|
||||
fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
let mut candidate_lsn = Lsn(0);
|
||||
let mut candidate = None;
|
||||
for l in self.historic_layers.iter() {
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key.to_i128()), IntKey::from(0)],
|
||||
[IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if !l.get_key_range().contains(&key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
assert!(l.get_key_range().contains(&key));
|
||||
let this_lsn = l.get_lsn_range().start;
|
||||
if this_lsn > lsn {
|
||||
continue;
|
||||
}
|
||||
assert!(this_lsn <= lsn);
|
||||
if this_lsn < candidate_lsn {
|
||||
// our previous candidate was better
|
||||
continue;
|
||||
@@ -279,10 +466,19 @@ impl LayerMap {
|
||||
lsn: Lsn,
|
||||
) -> Result<Vec<(Range<Key>, Option<Arc<dyn Layer>>)>> {
|
||||
let mut points = vec![key_range.start];
|
||||
for l in self.historic_layers.iter() {
|
||||
if l.get_lsn_range().start > lsn {
|
||||
continue;
|
||||
}
|
||||
let envelope = AABB::from_corners(
|
||||
[IntKey::from(key_range.start.to_i128()), IntKey::from(0)],
|
||||
[
|
||||
IntKey::from(key_range.end.to_i128()),
|
||||
IntKey::from(lsn.0 as i128),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
assert!(l.get_lsn_range().start <= lsn);
|
||||
let range = l.get_key_range();
|
||||
if key_range.contains(&range.start) {
|
||||
points.push(l.get_key_range().start);
|
||||
@@ -315,16 +511,29 @@ impl LayerMap {
|
||||
/// given key and LSN range.
|
||||
pub fn count_deltas(&self, key_range: &Range<Key>, lsn_range: &Range<Lsn>) -> Result<usize> {
|
||||
let mut result = 0;
|
||||
for l in self.historic_layers.iter() {
|
||||
if lsn_range.start >= lsn_range.end {
|
||||
return Ok(0);
|
||||
}
|
||||
let envelope = AABB::from_corners(
|
||||
[
|
||||
IntKey::from(key_range.start.to_i128()),
|
||||
IntKey::from(lsn_range.start.0 as i128),
|
||||
],
|
||||
[
|
||||
IntKey::from(key_range.end.to_i128() - 1),
|
||||
IntKey::from(lsn_range.end.0 as i128 - 1),
|
||||
],
|
||||
);
|
||||
for e in self
|
||||
.historic_layers
|
||||
.locate_in_envelope_intersecting(&envelope)
|
||||
{
|
||||
let l = &e.layer;
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_lsn_range(), lsn_range) {
|
||||
continue;
|
||||
}
|
||||
if !range_overlaps(&l.get_key_range(), key_range) {
|
||||
continue;
|
||||
}
|
||||
assert!(range_overlaps(&l.get_lsn_range(), lsn_range));
|
||||
assert!(range_overlaps(&l.get_key_range(), key_range));
|
||||
|
||||
// We ignore level0 delta layers, unless the whole keyspace fits into one partition.
|
||||
@@ -341,17 +550,7 @@ impl LayerMap {
|
||||
|
||||
/// Return all L0 delta layers
|
||||
pub fn get_level0_deltas(&self) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
let mut deltas = Vec::new();
|
||||
for l in self.historic_layers.iter() {
|
||||
if !l.is_incremental() {
|
||||
continue;
|
||||
}
|
||||
if l.get_key_range() != (Key::MIN..Key::MAX) {
|
||||
continue;
|
||||
}
|
||||
deltas.push(Arc::clone(l));
|
||||
}
|
||||
Ok(deltas)
|
||||
Ok(self.l0_delta_layers.clone())
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer map
|
||||
@@ -370,8 +569,8 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
println!("historic_layers:");
|
||||
for layer in self.historic_layers.iter() {
|
||||
layer.dump(verbose)?;
|
||||
for e in self.historic_layers.iter() {
|
||||
e.layer.dump(verbose)?;
|
||||
}
|
||||
println!("End dump LayerMap");
|
||||
Ok(())
|
||||
|
||||
@@ -8,7 +8,6 @@
|
||||
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -21,7 +20,12 @@ use utils::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::STORAGE_FORMAT_VERSION;
|
||||
|
||||
/// Use special format number to enable backward compatibility.
const METADATA_FORMAT_VERSION: u16 = 4;

/// Previous supported format versions.
const METADATA_OLD_FORMAT_VERSION: u16 = 3;

/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic.
///
@@ -29,28 +33,46 @@ use crate::STORAGE_FORMAT_VERSION;
/// see PG_CONTROL_MAX_SAFE_SIZE
const METADATA_MAX_SIZE: usize = 512;

/// The name of the metadata file pageserver creates per timeline.
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in Timeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineMetadata {
|
||||
hdr: TimelineMetadataHeader,
|
||||
body: TimelineMetadataBody,
|
||||
body: TimelineMetadataBodyV2,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataHeader {
|
||||
checksum: u32, // CRC of serialized metadata body
|
||||
size: u16, // size of serialized metadata
|
||||
format_version: u16, // storage format version (used for compatibility checks)
|
||||
format_version: u16, // metadata format version (used for compatibility checks)
|
||||
}
|
||||
const METADATA_HDR_SIZE: usize = std::mem::size_of::<TimelineMetadataHeader>();
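// On-disk layout sketch, as implied by to_bytes()/from_bytes() in this file
// (descriptive only, not a formal spec): a fixed-size header, the serialized body,
// and zero padding up to METADATA_MAX_SIZE so the whole write stays within the size
// assumed to be written atomically. hdr.size is METADATA_HDR_SIZE plus the body
// length, and hdr.checksum is crc32c over the serialized body bytes only.
//
//     [ TimelineMetadataHeader | TimelineMetadataBodyV2 | zero padding ]
//       METADATA_HDR_SIZE        hdr.size - header size   up to METADATA_MAX_SIZE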
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataBody {
|
||||
struct TimelineMetadataBodyV2 {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<TimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
struct TimelineMetadataBodyV1 {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
@@ -77,34 +99,63 @@ impl TimelineMetadata {
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> Self {
|
||||
Self {
|
||||
hdr: TimelineMetadataHeader {
|
||||
checksum: 0,
|
||||
size: 0,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
},
|
||||
body: TimelineMetadataBody {
|
||||
body: TimelineMetadataBodyV2 {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
pg_version,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
|
||||
|
||||
// backward compatible only up to this version
|
||||
ensure!(
|
||||
hdr.format_version == METADATA_OLD_FORMAT_VERSION,
|
||||
"unsupported metadata format version {}",
|
||||
hdr.format_version
|
||||
);
|
||||
|
||||
let metadata_size = hdr.size as usize;
|
||||
|
||||
let body: TimelineMetadataBodyV1 =
|
||||
TimelineMetadataBodyV1::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
|
||||
let body = TimelineMetadataBodyV2 {
|
||||
disk_consistent_lsn: body.disk_consistent_lsn,
|
||||
prev_record_lsn: body.prev_record_lsn,
|
||||
ancestor_timeline: body.ancestor_timeline,
|
||||
ancestor_lsn: body.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: body.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: body.initdb_lsn,
|
||||
pg_version: 14, // All timelines created before this version had pg_version 14
|
||||
};
|
||||
|
||||
hdr.format_version = METADATA_FORMAT_VERSION;
|
||||
|
||||
Ok(Self { hdr, body })
|
||||
}
|
||||
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
let hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?;
|
||||
ensure!(
|
||||
hdr.format_version == STORAGE_FORMAT_VERSION,
|
||||
"format version mismatch"
|
||||
);
|
||||
|
||||
let metadata_size = hdr.size as usize;
|
||||
ensure!(
|
||||
metadata_size <= METADATA_MAX_SIZE,
|
||||
@@ -115,13 +166,20 @@ impl TimelineMetadata {
|
||||
hdr.checksum == calculated_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
let body = TimelineMetadataBody::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
ensure!(
|
||||
body.disk_consistent_lsn.is_aligned(),
|
||||
"disk_consistent_lsn is not aligned"
|
||||
);
|
||||
|
||||
Ok(TimelineMetadata { hdr, body })
|
||||
if hdr.format_version != METADATA_FORMAT_VERSION {
|
||||
// If metadata has the old format,
|
||||
// upgrade it and return the result
|
||||
TimelineMetadata::upgrade_timeline_metadata(metadata_bytes)
|
||||
} else {
|
||||
let body =
|
||||
TimelineMetadataBodyV2::des(&metadata_bytes[METADATA_HDR_SIZE..metadata_size])?;
|
||||
ensure!(
|
||||
body.disk_consistent_lsn.is_aligned(),
|
||||
"disk_consistent_lsn is not aligned"
|
||||
);
|
||||
Ok(TimelineMetadata { hdr, body })
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
@@ -129,7 +187,7 @@ impl TimelineMetadata {
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: STORAGE_FORMAT_VERSION,
|
||||
format_version: METADATA_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
let hdr_bytes = hdr.ser()?;
|
||||
@@ -164,17 +222,10 @@ impl TimelineMetadata {
|
||||
pub fn initdb_lsn(&self) -> Lsn {
|
||||
self.body.initdb_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: TimelineId,
|
||||
tenant_id: TenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(METADATA_FILE_NAME)
|
||||
pub fn pg_version(&self) -> u32 {
|
||||
self.body.pg_version
|
||||
}
|
||||
}
|
||||
|
||||
/// Save timeline metadata to file
|
||||
@@ -186,7 +237,7 @@ pub fn save_metadata(
|
||||
first_save: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("saving metadata").entered();
|
||||
let path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let path = conf.metadata_path(timeline_id, tenant_id);
|
||||
// use OpenOptions to ensure file presence is consistent with first_save
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
@@ -227,6 +278,8 @@ mod tests {
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
// Any version will do here, so use the default
|
||||
crate::DEFAULT_PG_VERSION,
|
||||
);
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
@@ -241,4 +294,72 @@ mod tests {
|
||||
"Metadata that was serialized to bytes and deserialized back should not change"
|
||||
);
|
||||
}
|
||||
|
||||
// Generate old version metadata and read it with current code.
|
||||
// Ensure that it is upgraded correctly
|
||||
#[test]
|
||||
fn test_metadata_upgrade() {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct TimelineMetadataV1 {
|
||||
hdr: TimelineMetadataHeader,
|
||||
body: TimelineMetadataBodyV1,
|
||||
}
|
||||
|
||||
let metadata_v1 = TimelineMetadataV1 {
|
||||
hdr: TimelineMetadataHeader {
|
||||
checksum: 0,
|
||||
size: 0,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION,
|
||||
},
|
||||
body: TimelineMetadataBodyV1 {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
latest_gc_cutoff_lsn: Lsn(0),
|
||||
initdb_lsn: Lsn(0),
|
||||
},
|
||||
};
|
||||
|
||||
impl TimelineMetadataV1 {
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let body_bytes = self.body.ser()?;
|
||||
let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
|
||||
let hdr = TimelineMetadataHeader {
|
||||
size: metadata_size as u16,
|
||||
format_version: METADATA_OLD_FORMAT_VERSION,
|
||||
checksum: crc32c::crc32c(&body_bytes),
|
||||
};
|
||||
let hdr_bytes = hdr.ser()?;
|
||||
let mut metadata_bytes = vec![0u8; METADATA_MAX_SIZE];
|
||||
metadata_bytes[0..METADATA_HDR_SIZE].copy_from_slice(&hdr_bytes);
|
||||
metadata_bytes[METADATA_HDR_SIZE..metadata_size].copy_from_slice(&body_bytes);
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
let metadata_bytes = metadata_v1
|
||||
.to_bytes()
|
||||
.expect("Should serialize correct metadata to bytes");
|
||||
|
||||
// This should deserialize to the latest version format
|
||||
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
let expected_metadata = TimelineMetadata::new(
|
||||
Lsn(0x200),
|
||||
Some(Lsn(0x100)),
|
||||
Some(TIMELINE_ID),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
Lsn(0),
|
||||
14, // All timelines created before this version had pg_version 14
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata.body, expected_metadata.body,
|
||||
"Metadata of the old version {} should be upgraded to the latest version {}",
|
||||
METADATA_OLD_FORMAT_VERSION, METADATA_FORMAT_VERSION
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,12 +24,12 @@ use crate::tenant::{
|
||||
image_layer::{ImageLayer, ImageLayerWriter},
|
||||
inmemory_layer::InMemoryLayer,
|
||||
layer_map::{LayerMap, SearchResult},
|
||||
metadata::{save_metadata, TimelineMetadata, METADATA_FILE_NAME},
|
||||
metadata::{save_metadata, TimelineMetadata},
|
||||
par_fsync,
|
||||
storage_layer::{Layer, ValueReconstructResult, ValueReconstructState},
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::config::{PageServerConf, METADATA_FILE_NAME};
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::pgdatadir_mapping::BlockNumber;
|
||||
@@ -37,7 +37,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
|
||||
use postgres_ffi::v14::xlog_utils::to_pg_timestamp;
|
||||
use postgres_ffi::to_pg_timestamp;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::{AtomicLsn, Lsn, RecordLsn},
|
||||
@@ -61,6 +61,8 @@ pub struct Timeline {
|
||||
pub tenant_id: TenantId,
|
||||
pub timeline_id: TimelineId,
|
||||
|
||||
pub pg_version: u32,
|
||||
|
||||
pub layers: RwLock<LayerMap>,
|
||||
|
||||
last_freeze_at: AtomicLsn,
|
||||
@@ -232,14 +234,16 @@ impl LogicalSize {
|
||||
}
|
||||
|
||||
fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
|
||||
let size_increment = self.size_added_after_initial.load(AtomicOrdering::Acquire);
|
||||
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
|
||||
// ^^^ keep this type explicit so that the casts in this function break if
|
||||
// we change the type.
|
||||
match self.initial_logical_size.get() {
|
||||
Some(initial_size) => {
|
||||
let absolute_size_increment = u64::try_from(
|
||||
size_increment
|
||||
.checked_abs()
|
||||
.with_context(|| format!("Size added after initial {size_increment} is not expected to be i64::MIN"))?,
|
||||
).with_context(|| format!("Failed to convert size increment {size_increment} to u64"))?;
|
||||
).expect("casting nonnegative i64 to u64 should not fail");
|
||||
|
||||
if size_increment < 0 {
|
||||
initial_size.checked_sub(absolute_size_increment)
|
||||
@@ -249,11 +253,7 @@ impl LogicalSize {
|
||||
.map(CurrentLogicalSize::Exact)
|
||||
}
|
||||
None => {
|
||||
let non_negative_size_increment = if size_increment < 0 {
|
||||
0
|
||||
} else {
|
||||
u64::try_from(size_increment).expect("not negative, cannot fail")
|
||||
};
|
||||
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
|
||||
Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
|
||||
}
|
||||
}
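// Behavior note for the simplification above (illustrative): while the initial
// logical size is still unknown, a negative running increment is clamped to zero
// for the Approximate estimate, since u64::try_from fails for negative values:
//
//     assert_eq!(u64::try_from(-5i64).unwrap_or(0), 0);
//     assert_eq!(u64::try_from(7i64).unwrap_or(0), 7);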
|
||||
@@ -343,7 +343,9 @@ impl Timeline {
|
||||
match cached_lsn.cmp(&lsn) {
|
||||
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
|
||||
Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
|
||||
Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn
|
||||
Ordering::Greater => {
|
||||
unreachable!("the returned lsn should never be after the requested lsn")
|
||||
}
|
||||
}
|
||||
Some((cached_lsn, cached_img))
|
||||
}
|
||||
@@ -535,6 +537,7 @@ impl Timeline {
|
||||
tenant_id: TenantId,
|
||||
walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
|
||||
upload_layers: bool,
|
||||
pg_version: u32,
|
||||
) -> Timeline {
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
|
||||
@@ -543,6 +546,7 @@ impl Timeline {
|
||||
tenant_conf,
|
||||
timeline_id,
|
||||
tenant_id,
|
||||
pg_version,
|
||||
layers: RwLock::new(LayerMap::default()),
|
||||
|
||||
walredo_mgr,
|
||||
@@ -623,7 +627,7 @@ impl Timeline {
|
||||
.unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
|
||||
drop(tenant_conf_guard);
|
||||
let self_clone = Arc::clone(self);
|
||||
let _ = spawn_connection_manager_task(
|
||||
spawn_connection_manager_task(
|
||||
self.conf.broker_etcd_prefix.clone(),
|
||||
self_clone,
|
||||
walreceiver_connect_timeout,
|
||||
@@ -724,10 +728,10 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn layer_removal_guard(&self) -> Result<MutexGuard<()>, anyhow::Error> {
|
||||
pub fn layer_removal_guard(&self) -> anyhow::Result<MutexGuard<()>> {
|
||||
self.layer_removal_cs
|
||||
.try_lock()
|
||||
.map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}"))
|
||||
.map_err(|e| anyhow!("cannot lock compaction critical section {e}"))
|
||||
}
|
||||
|
||||
/// Retrieve current logical size of the timeline.
|
||||
@@ -1262,6 +1266,7 @@ impl Timeline {
|
||||
self.ancestor_lsn,
|
||||
*self.latest_gc_cutoff_lsn.read(),
|
||||
self.initdb_lsn,
|
||||
self.pg_version,
|
||||
);
|
||||
|
||||
fail_point!("checkpoint-before-saving-metadata", |x| bail!(
|
||||
@@ -1918,18 +1923,19 @@ impl Timeline {
|
||||
|
||||
let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
|
||||
|
||||
let _enter =
|
||||
info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff)
|
||||
.entered();
|
||||
|
||||
// Nothing to GC. Return early.
|
||||
let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
|
||||
if latest_gc_cutoff >= new_gc_cutoff {
|
||||
info!(
|
||||
"Nothing to GC for timeline {}: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
|
||||
self.timeline_id
|
||||
"Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
|
||||
);
|
||||
return Ok(result);
|
||||
}
|
||||
|
||||
let _enter = info_span!("garbage collection", timeline = %self.timeline_id, tenant = %self.tenant_id, cutoff = %new_gc_cutoff).entered();
|
||||
|
||||
// We need to ensure that no one tries to read page versions or create
|
||||
// branches at a point before latest_gc_cutoff_lsn. See branch_timeline()
|
||||
// for details. This will block until the old value is no longer in use.
|
||||
@@ -2051,7 +2057,7 @@ impl Timeline {
|
||||
l.filename().display(),
|
||||
l.is_incremental(),
|
||||
);
|
||||
layers_to_remove.push(Arc::clone(l));
|
||||
layers_to_remove.push(Arc::clone(&l));
|
||||
}
|
||||
|
||||
// Actually delete the layers from disk and remove them from the map.
|
||||
@@ -2134,9 +2140,13 @@ impl Timeline {
|
||||
|
||||
let last_rec_lsn = data.records.last().unwrap().0;
|
||||
|
||||
let img =
|
||||
self.walredo_mgr
|
||||
.request_redo(key, request_lsn, base_img, data.records)?;
|
||||
let img = self.walredo_mgr.request_redo(
|
||||
key,
|
||||
request_lsn,
|
||||
base_img,
|
||||
data.records,
|
||||
self.pg_version,
|
||||
)?;
|
||||
|
||||
if img.len() == page_cache::PAGE_SZ {
|
||||
let cache = page_cache::get();
|
||||
|
||||
@@ -8,14 +8,9 @@
|
||||
//! We cannot use global or default config instead, because wrong settings
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::num::NonZeroU64;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use utils::id::TenantId;
|
||||
|
||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
pub mod defaults {
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
@@ -224,12 +219,6 @@ impl TenantConf {
|
||||
}
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn path(conf: &'static PageServerConf, tenant_id: TenantId) -> PathBuf {
|
||||
conf.tenant_path(&tenant_id).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf() -> Self {
|
||||
TenantConf {
|
||||
|
||||
@@ -10,23 +10,21 @@ use std::sync::Arc;
|
||||
use anyhow::Context;
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::{path_with_suffix_extension, GenericRemoteStorage};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::config::{PageServerConf, METADATA_FILE_NAME};
|
||||
use crate::http::models::TenantInfo;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
Tenant, TenantState,
|
||||
ephemeral_file::is_ephemeral_file, metadata::TimelineMetadata, Tenant, TenantState,
|
||||
};
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{TenantTimelineValues, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe_dir;
|
||||
use utils::crashsafe_dir::{self, path_with_suffix_extension};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
@@ -109,6 +107,13 @@ pub fn init_tenant_mgr(
|
||||
/// Ignores other timelines that might be present for tenant, but were not passed as a parameter.
|
||||
/// Attempts to load as many entites as possible: if a certain timeline fails during the load, the tenant is marked as "Broken",
|
||||
/// and the load continues.
|
||||
///
|
||||
/// For successful tenant attach, it first has to have a `timelines/` subdirectory and a tenant config file that's loaded into memory successfully.
|
||||
/// If either of the conditions fails, the tenant will be added to memory with [`TenantState::Broken`] state, otherwise we start to load its timelines.
|
||||
/// Alternatively, tenant is considered loaded successfully, if it's already in pageserver's memory (i.e. was loaded already before).
|
||||
///
|
||||
/// Attach happens on startup and successful timeline downloads
|
||||
/// (some subset of timeline files, always including its metadata, after which the new one needs to be registered).
|
||||
pub fn attach_local_tenants(
|
||||
conf: &'static PageServerConf,
|
||||
remote_index: &RemoteIndex,
|
||||
@@ -124,18 +129,20 @@ pub fn attach_local_tenants(
|
||||
);
|
||||
debug!("Timelines to attach: {local_timelines:?}");
|
||||
|
||||
let tenant = load_local_tenant(conf, tenant_id, remote_index);
|
||||
{
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
error!("Cannot attach tenant {tenant_id}: there's already an entry in the tenant state");
|
||||
continue;
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(Arc::clone(&tenant));
|
||||
}
|
||||
let mut tenants_accessor = tenants_state::write_tenants();
|
||||
let tenant = match tenants_accessor.entry(tenant_id) {
|
||||
hash_map::Entry::Occupied(o) => {
|
||||
info!("Tenant {tenant_id} was found in pageserver's memory");
|
||||
Arc::clone(o.get())
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
info!("Tenant {tenant_id} was not found in pageserver's memory, loading it");
|
||||
let tenant = load_local_tenant(conf, tenant_id, remote_index);
|
||||
v.insert(Arc::clone(&tenant));
|
||||
tenant
|
||||
}
|
||||
};
|
||||
drop(tenants_accessor);
|
||||
|
||||
if tenant.current_state() == TenantState::Broken {
|
||||
warn!("Skipping timeline load for broken tenant {tenant_id}")
|
||||
@@ -170,16 +177,28 @@ fn load_local_tenant(
|
||||
remote_index.clone(),
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
match Tenant::load_tenant_config(conf, tenant_id) {
|
||||
Ok(tenant_conf) => {
|
||||
tenant.update_tenant_config(tenant_conf);
|
||||
tenant.activate(false);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
|
||||
let tenant_timelines_dir = conf.timelines_path(&tenant_id);
|
||||
if !tenant_timelines_dir.is_dir() {
|
||||
error!(
|
||||
"Tenant {} has no timelines directory at {}",
|
||||
tenant_id,
|
||||
tenant_timelines_dir.display()
|
||||
);
|
||||
tenant.set_state(TenantState::Broken);
|
||||
} else {
|
||||
match Tenant::load_tenant_config(conf, tenant_id) {
|
||||
Ok(tenant_conf) => {
|
||||
tenant.update_tenant_config(tenant_conf);
|
||||
tenant.activate(false);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Failed to read config for tenant {tenant_id}, disabling tenant: {e:?}");
|
||||
tenant.set_state(TenantState::Broken);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
tenant
|
||||
}
|
||||
|
||||
@@ -246,7 +265,7 @@ fn create_tenant_files(
|
||||
&temporary_tenant_dir,
|
||||
)?;
|
||||
let temporary_tenant_config_path = rebase_directory(
|
||||
&TenantConf::path(conf, tenant_id),
|
||||
&conf.tenant_config_path(tenant_id),
|
||||
&target_tenant_directory,
|
||||
&temporary_tenant_dir,
|
||||
)?;
|
||||
@@ -343,7 +362,7 @@ pub fn update_tenant_config(
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
get_tenant(tenant_id, true)?.update_tenant_config(tenant_conf);
|
||||
Tenant::persist_tenant_config(&TenantConf::path(conf, tenant_id), tenant_conf, false)?;
|
||||
Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -627,14 +646,10 @@ fn collect_timelines_for_tenant(
|
||||
}
|
||||
|
||||
if tenant_timelines.is_empty() {
|
||||
match remove_if_empty(&timelines_dir) {
|
||||
Ok(true) => info!(
|
||||
"Removed empty tenant timelines directory {}",
|
||||
timelines_dir.display()
|
||||
),
|
||||
Ok(false) => (),
|
||||
Err(e) => error!("Failed to remove empty tenant timelines directory: {e:?}"),
|
||||
}
|
||||
// this is normal, we've removed all broken, empty and temporary timeline dirs
|
||||
// but should allow the tenant to stay functional and allow creating new timelines
|
||||
// on a restart, we require tenants to have the timelines dir, so leave it on disk
|
||||
debug!("Tenant {tenant_id} has no timelines loaded");
|
||||
}
|
||||
|
||||
Ok((tenant_id, tenant_timelines))
|
||||
|
||||
@@ -21,7 +21,9 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
&format!("compactor for tenant {tenant_id}"),
|
||||
false,
|
||||
async move {
|
||||
compaction_loop(tenant_id).await;
|
||||
compaction_loop(tenant_id)
|
||||
.instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
@@ -33,7 +35,9 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
&format!("garbage collector for tenant {tenant_id}"),
|
||||
false,
|
||||
async move {
|
||||
gc_loop(tenant_id).await;
|
||||
gc_loop(tenant_id)
|
||||
.instrument(info_span!("gc_loop", tenant_id = %tenant_id))
|
||||
.await;
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
@@ -44,7 +48,7 @@ pub fn start_background_loops(tenant_id: TenantId) {
|
||||
///
|
||||
async fn compaction_loop(tenant_id: TenantId) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting compaction loop for {tenant_id}");
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
loop {
|
||||
@@ -52,7 +56,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received compaction cancellation request");
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
@@ -73,7 +77,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received compaction cancellation request during idling");
|
||||
info!("received cancellation request during idling");
|
||||
break ;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
@@ -91,7 +95,7 @@ async fn compaction_loop(tenant_id: TenantId) {
|
||||
///
|
||||
async fn gc_loop(tenant_id: TenantId) {
|
||||
let wait_duration = Duration::from_secs(2);
|
||||
info!("starting gc loop for {tenant_id}");
|
||||
info!("starting");
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
async {
|
||||
loop {
|
||||
@@ -99,7 +103,7 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
|
||||
let tenant = tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received GC cancellation request");
|
||||
info!("received cancellation request");
|
||||
return;
|
||||
},
|
||||
tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
|
||||
@@ -123,7 +127,7 @@ async fn gc_loop(tenant_id: TenantId) {
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = task_mgr::shutdown_watcher() => {
|
||||
info!("received GC cancellation request during idling");
|
||||
info!("received cancellation request during idling");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
|
||||
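
The hunks above move the tenant id out of the loop's log messages and into a tracing span via `.instrument(info_span!(...))`, which is why "starting compaction loop for {tenant_id}" becomes just "starting". A small standalone illustration of that pattern (not part of this commit; it assumes the tokio, tracing and tracing-subscriber crates, and the tenant id shown is a hypothetical value):

// Standalone illustration of span-based logging: the tenant id is carried by the
// enclosing span, so the message inside the loop no longer needs to repeat it.
use tracing::{info, info_span, Instrument};

async fn compaction_loop() {
    // No tenant id in the message: the enclosing span supplies it.
    info!("starting");
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    let tenant_id = "hypothetical-tenant";
    compaction_loop()
        .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
        .await;
}
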
@@ -1,168 +0,0 @@
|
||||
//!
|
||||
//! Timeline management code
|
||||
//
|
||||
|
||||
use std::{
|
||||
fs,
|
||||
path::Path,
|
||||
process::{Command, Stdio},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use tracing::*;
|
||||
|
||||
use remote_storage::path_with_suffix_extension;
|
||||
use utils::{
|
||||
id::{TenantId, TimelineId},
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::tenant::{Tenant, Timeline};
|
||||
use crate::tenant_mgr;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{import_datadir, TEMP_FILE_SUFFIX};
|
||||
|
||||
// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
// to get bootstrap data for timeline initialization.
|
||||
//
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", &initdbpath.to_string_lossy()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.args(&["-E", "utf8"])
|
||||
.arg("--no-instructions")
|
||||
// This is only used for a temporary installation that is deleted shortly after,
|
||||
// so no need to fsync it
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.context("failed to execute initdb")?;
|
||||
if !initdb_output.status.success() {
|
||||
bail!(
|
||||
"initdb failed: '{}'",
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
//
|
||||
// - run initdb to init a temporary instance and get bootstrap data
// - after initialization completes, remove the temp dir.
|
||||
//
|
||||
fn bootstrap_timeline(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
// create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/`
|
||||
// temporary directory for basebackup files for the given timeline.
|
||||
let initdb_path = path_with_suffix_extension(
|
||||
conf.timelines_path(&tenant_id)
|
||||
.join(format!("basebackup-{timeline_id}")),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
// Init a temporary repo to get bootstrap data
|
||||
run_initdb(conf, &initdb_path)?;
|
||||
let pgdata_path = initdb_path;
|
||||
|
||||
let lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront, we avoid an Option or a dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = tenant.create_empty_timeline(timeline_id, lsn)?;
|
||||
import_datadir::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
|
||||
fail::fail_point!("before-checkpoint-new-timeline", |_| {
|
||||
bail!("failpoint before-checkpoint-new-timeline");
|
||||
});
|
||||
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
info!(
|
||||
"created root timeline {} timeline.lsn {}",
|
||||
timeline_id,
|
||||
timeline.get_last_record_lsn()
|
||||
);
|
||||
|
||||
// Remove temp dir. We don't need it anymore
|
||||
fs::remove_dir_all(pgdata_path)?;
|
||||
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
///
|
||||
/// Create a new timeline.
|
||||
///
|
||||
/// Returns the new timeline ID and a reference to its Timeline object.
///
/// If the caller specified the timeline ID to use (`new_timeline_id`), and a timeline with
/// the same timeline ID already exists, returns None. If `new_timeline_id` is not given,
/// a new unique ID is generated.
|
||||
///
|
||||
pub(crate) async fn create_timeline(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
new_timeline_id: Option<TimelineId>,
|
||||
ancestor_timeline_id: Option<TimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
) -> Result<Option<Arc<Timeline>>> {
|
||||
let new_timeline_id = new_timeline_id.unwrap_or_else(TimelineId::generate);
|
||||
let tenant = tenant_mgr::get_tenant(tenant_id, true)?;
|
||||
|
||||
if conf.timeline_path(&new_timeline_id, &tenant_id).exists() {
|
||||
debug!("timeline {new_timeline_id} already exists");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let loaded_timeline = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = tenant
|
||||
.get_timeline(ancestor_timeline_id)
|
||||
.context("Cannot branch off the timeline that's not present in pageserver")?;
|
||||
|
||||
if let Some(lsn) = ancestor_start_lsn.as_mut() {
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
// to the requested branch point. The repository code itself doesn't
|
||||
// require it, but if we start to receive WAL on the new timeline,
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
*lsn = lsn.align();
|
||||
ancestor_timeline.wait_lsn(*lsn).await?;
|
||||
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > *lsn {
|
||||
// can we safely just branch from the ancestor instead?
|
||||
anyhow::bail!(
|
||||
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
|
||||
lsn,
|
||||
ancestor_timeline_id,
|
||||
ancestor_ancestor_lsn,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
tenant.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?
|
||||
}
|
||||
None => bootstrap_timeline(conf, tenant_id, new_timeline_id, &tenant)?,
|
||||
};
|
||||
|
||||
// A new timeline has been added to the tenant; now its background tasks are needed.
|
||||
tenant.activate(true);
|
||||
|
||||
Ok(Some(loaded_timeline))
|
||||
}
|
||||
@@ -34,8 +34,9 @@ use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walrecord::*;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment;
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::v14::xlog_utils::*;
|
||||
use postgres_ffi::v14::CheckPoint;
|
||||
use postgres_ffi::TransactionId;
|
||||
@@ -82,7 +83,8 @@ impl<'a> WalIngest<'a> {
|
||||
decoded: &mut DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
modification.lsn = lsn;
|
||||
decode_wal_record(recdata, decoded).context("failed decoding wal record")?;
|
||||
decode_wal_record(recdata, decoded, self.timeline.pg_version)
|
||||
.context("failed decoding wal record")?;
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
@@ -113,18 +115,49 @@ impl<'a> WalIngest<'a> {
|
||||
let truncate = XlSmgrTruncate::decode(&mut buf);
|
||||
self.ingest_xlog_smgr_truncate(modification, &truncate)?;
|
||||
} else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
|
||||
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||
self.ingest_xlog_dbase_create(modification, &createdb)?;
|
||||
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== pg_constants::XLOG_DBASE_DROP
|
||||
{
|
||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||
for tablespace_id in dropdb.tablespace_ids {
|
||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
|
||||
debug!(
|
||||
"handle RM_DBASE_ID for Postgres version {:?}",
|
||||
self.timeline.pg_version
|
||||
);
|
||||
if self.timeline.pg_version == 14 {
|
||||
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
|
||||
{
|
||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||
debug!("XLOG_DBASE_CREATE v14");
|
||||
|
||||
self.ingest_xlog_dbase_create(modification, &createdb)?;
|
||||
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== postgres_ffi::v14::bindings::XLOG_DBASE_DROP
|
||||
{
|
||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||
for tablespace_id in dropdb.tablespace_ids {
|
||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
|
||||
}
|
||||
}
|
||||
} else if self.timeline.pg_version == 15 {
|
||||
if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG
|
||||
{
|
||||
debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY
|
||||
{
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
let createdb = XlCreateDatabase::decode(&mut buf);
|
||||
self.ingest_xlog_dbase_create(modification, &createdb)?;
|
||||
} else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
|
||||
== postgres_ffi::v15::bindings::XLOG_DBASE_DROP
|
||||
{
|
||||
let dropdb = XlDropDatabase::decode(&mut buf);
|
||||
for tablespace_id in dropdb.tablespace_ids {
|
||||
trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
|
||||
modification.drop_dbdir(tablespace_id, dropdb.db_id)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
|
||||
@@ -291,7 +324,7 @@ impl<'a> WalIngest<'a> {
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
// compression of WAL is not yet supported: fall back to storing the original WAL record
|
||||
&& (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0
|
||||
&& !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)?
|
||||
{
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
@@ -392,7 +425,7 @@ impl<'a> WalIngest<'a> {
|
||||
// Clear the VM bits if required.
|
||||
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
|
||||
let vm_rel = RelTag {
|
||||
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
forknum: VISIBILITYMAP_FORKNUM,
|
||||
spcnode: decoded.blocks[0].rnode_spcnode,
|
||||
dbnode: decoded.blocks[0].rnode_dbnode,
|
||||
relnode: decoded.blocks[0].rnode_relnode,
|
||||
@@ -568,7 +601,7 @@ impl<'a> WalIngest<'a> {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::MAIN_FORKNUM,
|
||||
forknum: MAIN_FORKNUM,
|
||||
};
|
||||
self.put_rel_truncation(modification, rel, rec.blkno)?;
|
||||
}
|
||||
@@ -577,7 +610,7 @@ impl<'a> WalIngest<'a> {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::FSM_FORKNUM,
|
||||
forknum: FSM_FORKNUM,
|
||||
};
|
||||
|
||||
// FIXME: 'blkno' stored in the WAL record is the new size of the
|
||||
@@ -600,7 +633,7 @@ impl<'a> WalIngest<'a> {
|
||||
spcnode,
|
||||
dbnode,
|
||||
relnode,
|
||||
forknum: pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
forknum: VISIBILITYMAP_FORKNUM,
|
||||
};
|
||||
|
||||
// FIXME: Like with the FSM above, the logic to truncate the VM
|
||||
@@ -672,7 +705,7 @@ impl<'a> WalIngest<'a> {
|
||||
)?;
|
||||
|
||||
for xnode in &parsed.xnodes {
|
||||
for forknum in pg_constants::MAIN_FORKNUM..=pg_constants::VISIBILITYMAP_FORKNUM {
|
||||
for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM {
|
||||
let rel = RelTag {
|
||||
forknum,
|
||||
spcnode: xnode.spcnode,
|
||||
@@ -1032,6 +1065,8 @@ mod tests {
|
||||
use postgres_ffi::v14::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use postgres_ffi::RELSEG_SIZE;
|
||||
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A: RelTag = RelTag {
|
||||
spcnode: 0,
|
||||
@@ -1059,7 +1094,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_relsize")?.load();
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
@@ -1187,7 +1222,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_drop_extend() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_drop_extend")?.load();
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut m = tline.begin_modification(Lsn(0x20));
|
||||
@@ -1227,7 +1262,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_truncate_extend() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_truncate_extend")?.load();
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
// Create a 20 MB relation (the size is arbitrary)
|
||||
@@ -1315,7 +1350,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let tenant = TenantHarness::create("test_large_rel")?.load();
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID)?;
|
||||
let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?;
|
||||
let mut walingest = init_walingest_test(&*tline)?;
|
||||
|
||||
let mut lsn = 0x10;
|
||||
|
||||
@@ -31,7 +31,6 @@ use etcd_broker::Client;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::future::Future;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::watch;
|
||||
use tracing::*;
|
||||
use url::Url;
|
||||
@@ -88,37 +87,44 @@ pub fn is_etcd_client_initialized() -> bool {
|
||||
/// That may lead to certain events not being observed by the listener.
|
||||
#[derive(Debug)]
|
||||
pub struct TaskHandle<E> {
|
||||
events_receiver: watch::Receiver<TaskEvent<E>>,
|
||||
join_handle: Option<tokio::task::JoinHandle<anyhow::Result<()>>>,
|
||||
events_receiver: watch::Receiver<TaskStateUpdate<E>>,
|
||||
cancellation: watch::Sender<()>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum TaskEvent<E> {
|
||||
Update(TaskStateUpdate<E>),
|
||||
End(anyhow::Result<()>),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum TaskStateUpdate<E> {
|
||||
Init,
|
||||
Started,
|
||||
NewEvent(E),
|
||||
End,
|
||||
Progress(E),
|
||||
}
|
||||
|
||||
impl<E: Clone> TaskHandle<E> {
|
||||
/// Initializes the task, starting it immediately after the creation.
|
||||
pub fn spawn<Fut>(
|
||||
task: impl FnOnce(Arc<watch::Sender<TaskEvent<E>>>, watch::Receiver<()>) -> Fut + Send + 'static,
|
||||
task: impl FnOnce(watch::Sender<TaskStateUpdate<E>>, watch::Receiver<()>) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
) -> Self
|
||||
where
|
||||
Fut: Future<Output = Result<(), String>> + Send,
|
||||
E: Sync + Send + 'static,
|
||||
Fut: Future<Output = anyhow::Result<()>> + Send,
|
||||
E: Send + Sync + 'static,
|
||||
{
|
||||
let (cancellation, cancellation_receiver) = watch::channel(());
|
||||
let (events_sender, events_receiver) = watch::channel(TaskEvent::Started);
|
||||
let events_sender = Arc::new(events_sender);
|
||||
let (events_sender, events_receiver) = watch::channel(TaskStateUpdate::Started);
|
||||
|
||||
let sender = Arc::clone(&events_sender);
|
||||
let _ = WALRECEIVER_RUNTIME.spawn(async move {
|
||||
events_sender.send(TaskEvent::Started).ok();
|
||||
task(sender, cancellation_receiver).await
|
||||
let join_handle = WALRECEIVER_RUNTIME.spawn(async move {
|
||||
events_sender.send(TaskStateUpdate::Started).ok();
|
||||
task(events_sender, cancellation_receiver).await
|
||||
});
|
||||
|
||||
TaskHandle {
|
||||
join_handle: Some(join_handle),
|
||||
events_receiver,
|
||||
cancellation,
|
||||
}
|
||||
@@ -126,15 +132,45 @@ impl<E: Clone> TaskHandle<E> {
|
||||
|
||||
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
||||
match self.events_receiver.changed().await {
|
||||
Ok(()) => self.events_receiver.borrow().clone(),
|
||||
Err(_task_channel_part_dropped) => TaskEvent::End,
|
||||
Ok(()) => TaskEvent::Update((self.events_receiver.borrow()).clone()),
|
||||
Err(_task_channel_part_dropped) => {
|
||||
TaskEvent::End(match self.join_handle.take() {
|
||||
Some(jh) => {
|
||||
if !jh.is_finished() {
|
||||
warn!("sender is dropped while join handle is still alive");
|
||||
}
|
||||
|
||||
jh.await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to join task: {e}"))
|
||||
.and_then(|x| x)
|
||||
}
|
||||
None => {
|
||||
// Another option is to have an enum of either the join handle or the result, and give away a reference to it
|
||||
Err(anyhow::anyhow!("Task was joined more than once"))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Aborts current task, waiting for it to finish.
|
||||
pub async fn shutdown(mut self) {
|
||||
self.cancellation.send(()).ok();
|
||||
// wait until the sender is dropped
|
||||
while self.events_receiver.changed().await.is_ok() {}
|
||||
pub async fn shutdown(self) {
|
||||
match self.join_handle {
|
||||
Some(jh) => {
|
||||
self.cancellation.send(()).ok();
|
||||
match jh.await {
|
||||
Ok(Ok(())) => debug!("Shutdown success"),
|
||||
Ok(Err(e)) => error!("Shutdown task error: {e:?}"),
|
||||
Err(join_error) => {
|
||||
if join_error.is_cancelled() {
|
||||
error!("Shutdown task was cancelled");
|
||||
} else {
|
||||
error!("Shutdown task join error: {join_error}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
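
The reworked TaskHandle above keeps the JoinHandle so that, once the progress sender is dropped, the task's final anyhow::Result can be surfaced as TaskEvent::End. A standalone sketch of the underlying watch-channel pattern (not part of this commit; names are simplified stand-ins and it assumes the tokio and anyhow crates):

// Standalone sketch: the task reports progress over a watch channel, and its final
// result is recovered from the JoinHandle once the sender side is dropped.
use tokio::sync::watch;

#[derive(Debug, Clone)]
enum Update {
    Started,
    Progress(u32),
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let (events_tx, mut events_rx) = watch::channel(Update::Started);

    // The spawned task owns the sender; dropping it signals "no more updates".
    let join_handle = tokio::spawn(async move {
        events_tx.send(Update::Progress(1)).ok();
        anyhow::Ok(())
    });

    // Consume updates until the sender is gone, then recover the task's final
    // result from the JoinHandle, mirroring TaskEvent::Update / TaskEvent::End above.
    while events_rx.changed().await.is_ok() {
        println!("update: {:?}", *events_rx.borrow());
    }
    join_handle.await??;
    Ok(())
}
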
@@ -16,10 +16,10 @@ use std::{
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::task_mgr::WALRECEIVER_RUNTIME;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::{task_mgr, walreceiver::TaskStateUpdate};
|
||||
use anyhow::Context;
|
||||
use chrono::{NaiveDateTime, Utc};
|
||||
use etcd_broker::{
|
||||
@@ -58,7 +58,10 @@ pub fn spawn_connection_manager_task(
|
||||
TaskKind::WalReceiverManager,
|
||||
Some(tenant_id),
|
||||
Some(timeline_id),
|
||||
&format!("walreceiver for tenant {} timeline {}", timeline.tenant_id, timeline.timeline_id),
|
||||
&format!(
|
||||
"walreceiver for tenant {} timeline {}",
|
||||
timeline.tenant_id, timeline.timeline_id
|
||||
),
|
||||
false,
|
||||
async move {
|
||||
info!("WAL receiver broker started, connecting to etcd");
|
||||
@@ -88,7 +91,9 @@ pub fn spawn_connection_manager_task(
|
||||
}
|
||||
}
|
||||
}
|
||||
.instrument(info_span!("wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
|
||||
.instrument(
|
||||
info_span!("wal_connection_manager", tenant = %tenant_id, timeline = %timeline_id),
|
||||
),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -140,19 +145,26 @@ async fn connection_manager_loop_step(
|
||||
let wal_connection = walreceiver_state.wal_connection.as_mut()
|
||||
.expect("Should have a connection, as checked by the corresponding select! guard");
|
||||
match wal_connection_update {
|
||||
TaskEvent::Started => {},
|
||||
TaskEvent::NewEvent(status) => {
|
||||
if status.has_processed_wal {
|
||||
// We have advanced last_record_lsn by processing the WAL received
|
||||
// from this safekeeper. This is good enough to clean unsuccessful
|
||||
// retries history and allow reconnecting to this safekeeper without
|
||||
// sleeping for a long time.
|
||||
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
|
||||
TaskEvent::Update(c) => {
|
||||
match c {
|
||||
TaskStateUpdate::Init | TaskStateUpdate::Started => {},
|
||||
TaskStateUpdate::Progress(status) => {
|
||||
if status.has_processed_wal {
|
||||
// We have advanced last_record_lsn by processing the WAL received
|
||||
// from this safekeeper. This is good enough to clean unsuccessful
|
||||
// retries history and allow reconnecting to this safekeeper without
|
||||
// sleeping for a long time.
|
||||
walreceiver_state.wal_connection_retries.remove(&wal_connection.sk_id);
|
||||
}
|
||||
wal_connection.status = status.to_owned();
|
||||
}
|
||||
}
|
||||
wal_connection.status = status;
|
||||
},
|
||||
TaskEvent::End => {
|
||||
debug!("WAL receiving task finished");
|
||||
TaskEvent::End(walreceiver_task_result) => {
|
||||
match walreceiver_task_result {
|
||||
Ok(()) => debug!("WAL receiving task finished"),
|
||||
Err(e) => error!("wal receiver task finished with an error: {e:?}"),
|
||||
}
|
||||
walreceiver_state.drop_old_connection(false).await;
|
||||
},
|
||||
}
|
||||
@@ -358,13 +370,13 @@ impl WalreceiverState {
|
||||
async move {
|
||||
super::walreceiver_connection::handle_walreceiver_connection(
|
||||
timeline,
|
||||
&new_wal_source_connstr,
|
||||
events_sender.as_ref(),
|
||||
new_wal_source_connstr,
|
||||
events_sender,
|
||||
cancellation,
|
||||
connect_timeout,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| format!("walreceiver connection handling failure: {e:#}"))
|
||||
.context("walreceiver connection handling failure")
|
||||
}
|
||||
.instrument(info_span!("walreceiver_connection", id = %id))
|
||||
});
|
||||
@@ -880,7 +892,7 @@ mod tests {
|
||||
status: connection_status.clone(),
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskEvent::NewEvent(connection_status.clone()))
|
||||
.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
@@ -1140,7 +1152,7 @@ mod tests {
|
||||
status: connection_status.clone(),
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskEvent::NewEvent(connection_status.clone()))
|
||||
.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
@@ -1228,7 +1240,7 @@ mod tests {
|
||||
status: connection_status.clone(),
|
||||
connection_task: TaskHandle::spawn(move |sender, _| async move {
|
||||
sender
|
||||
.send(TaskEvent::NewEvent(connection_status.clone()))
|
||||
.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
.ok();
|
||||
Ok(())
|
||||
}),
|
||||
@@ -1353,7 +1365,7 @@ mod tests {
|
||||
|
||||
const DUMMY_SAFEKEEPER_CONNSTR: &str = "safekeeper_connstr";
|
||||
|
||||
fn dummy_state(harness: &TenantHarness) -> WalreceiverState {
|
||||
fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState {
|
||||
WalreceiverState {
|
||||
id: TenantTimelineId {
|
||||
tenant_id: harness.tenant_id,
|
||||
@@ -1361,7 +1373,7 @@ mod tests {
|
||||
},
|
||||
timeline: harness
|
||||
.load()
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0))
|
||||
.create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION)
|
||||
.expect("Failed to create an empty timeline for dummy wal connection manager"),
|
||||
wal_connect_timeout: Duration::from_secs(1),
|
||||
lagging_wal_timeout: Duration::from_secs(1),
|
||||
|
||||
@@ -16,10 +16,9 @@ use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use tokio::{pin, select, sync::watch, time};
|
||||
use tokio_postgres::{replication::ReplicationStream, Client};
|
||||
use tracing::{debug, error, info, info_span, trace, warn, Instrument};
|
||||
use tracing::{debug, error, info, trace, warn};
|
||||
|
||||
use super::TaskEvent;
|
||||
use crate::metrics::LIVE_CONNECTIONS_COUNT;
|
||||
use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate};
|
||||
use crate::{
|
||||
task_mgr,
|
||||
task_mgr::TaskKind,
|
||||
@@ -29,7 +28,7 @@ use crate::{
|
||||
walingest::WalIngest,
|
||||
walrecord::DecodedWALRecord,
|
||||
};
|
||||
use postgres_ffi::v14::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::waldecoder::WalStreamDecoder;
|
||||
use utils::id::TenantTimelineId;
|
||||
use utils::{lsn::Lsn, pq_proto::ReplicationFeedback};
|
||||
|
||||
@@ -55,8 +54,8 @@ pub struct WalConnectionStatus {
|
||||
/// messages as we go.
|
||||
pub async fn handle_walreceiver_connection(
|
||||
timeline: Arc<Timeline>,
|
||||
wal_source_connstr: &str,
|
||||
events_sender: &watch::Sender<TaskEvent<WalConnectionStatus>>,
|
||||
wal_source_connstr: String,
|
||||
events_sender: watch::Sender<TaskStateUpdate<WalConnectionStatus>>,
|
||||
mut cancellation: watch::Receiver<()>,
|
||||
connect_timeout: Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -81,7 +80,7 @@ pub async fn handle_walreceiver_connection(
|
||||
streaming_lsn: None,
|
||||
commit_lsn: None,
|
||||
};
|
||||
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
|
||||
warn!("Wal connection event listener dropped right after connection init, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -112,8 +111,7 @@ pub async fn handle_walreceiver_connection(
|
||||
_ = connection_cancellation.changed() => info!("Connection cancelled"),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
.instrument(info_span!("walreceiver connection")),
|
||||
},
|
||||
);
|
||||
|
||||
// Immediately increment the gauge, then create a job to decrement it on task exit.
|
||||
@@ -134,7 +132,7 @@ pub async fn handle_walreceiver_connection(
|
||||
connection_status.latest_connection_update = Utc::now().naive_utc();
|
||||
connection_status.latest_wal_update = Utc::now().naive_utc();
|
||||
connection_status.commit_lsn = Some(end_of_wal);
|
||||
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
|
||||
warn!("Wal connection event listener dropped after IDENTIFY_SYSTEM, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -166,7 +164,7 @@ pub async fn handle_walreceiver_connection(
|
||||
let physical_stream = ReplicationStream::new(copy_stream);
|
||||
pin!(physical_stream);
|
||||
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
|
||||
|
||||
let mut walingest = WalIngest::new(timeline.as_ref(), startpoint)?;
|
||||
|
||||
@@ -202,7 +200,7 @@ pub async fn handle_walreceiver_connection(
|
||||
}
|
||||
&_ => {}
|
||||
};
|
||||
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone())) {
|
||||
warn!("Wal connection event listener dropped, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
@@ -268,7 +266,8 @@ pub async fn handle_walreceiver_connection(
|
||||
if !connection_status.has_processed_wal && last_rec_lsn > last_rec_lsn_before_msg {
|
||||
// We have successfully processed at least one WAL record.
|
||||
connection_status.has_processed_wal = true;
|
||||
if let Err(e) = events_sender.send(TaskEvent::NewEvent(connection_status.clone())) {
|
||||
if let Err(e) = events_sender.send(TaskStateUpdate::Progress(connection_status.clone()))
|
||||
{
|
||||
warn!("Wal connection event listener dropped, aborting the connection: {e}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
@@ -3,12 +3,11 @@
|
||||
//!
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::v14::xlog_utils::XLOG_SIZE_OF_XLOG_RECORD;
|
||||
use postgres_ffi::v14::XLogRecord;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{BlockNumber, OffsetNumber, TimestampTz};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId};
|
||||
use postgres_ffi::{XLogRecord, XLOG_SIZE_OF_XLOG_RECORD};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
use utils::bin_ser::DeserializeError;
|
||||
@@ -390,6 +389,16 @@ impl XlXactParsedRecord {
|
||||
xid = buf.get_u32_le();
|
||||
trace!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE");
|
||||
}
|
||||
|
||||
if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 {
|
||||
let nitems = buf.get_i32_le();
|
||||
debug!(
|
||||
"XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}",
|
||||
nitems
|
||||
);
|
||||
//FIXME: do we need to handle dropped stats here?
|
||||
}
|
||||
|
||||
XlXactParsedRecord {
|
||||
xid,
|
||||
info,
|
||||
@@ -517,7 +526,8 @@ impl XlMultiXactTruncate {
|
||||
pub fn decode_wal_record(
|
||||
record: Bytes,
|
||||
decoded: &mut DecodedWALRecord,
|
||||
) -> Result<(), DeserializeError> {
|
||||
pg_version: u32,
|
||||
) -> Result<()> {
|
||||
let mut rnode_spcnode: u32 = 0;
|
||||
let mut rnode_dbnode: u32 = 0;
|
||||
let mut rnode_relnode: u32 = 0;
|
||||
@@ -610,9 +620,21 @@ pub fn decode_wal_record(
|
||||
blk.hole_offset = buf.get_u16_le();
|
||||
blk.bimg_info = buf.get_u8();
|
||||
|
||||
blk.apply_image = (blk.bimg_info & pg_constants::BKPIMAGE_APPLY) != 0;
|
||||
blk.apply_image = if pg_version == 14 {
|
||||
(blk.bimg_info & postgres_ffi::v14::bindings::BKPIMAGE_APPLY) != 0
|
||||
} else {
|
||||
assert_eq!(pg_version, 15);
|
||||
(blk.bimg_info & postgres_ffi::v15::bindings::BKPIMAGE_APPLY) != 0
|
||||
};
|
||||
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED != 0 {
|
||||
let blk_img_is_compressed =
|
||||
postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)?;
|
||||
|
||||
if blk_img_is_compressed {
|
||||
debug!("compressed block image , pg_version = {}", pg_version);
|
||||
}
|
||||
|
||||
if blk_img_is_compressed {
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 {
|
||||
blk.hole_length = buf.get_u16_le();
|
||||
} else {
|
||||
@@ -665,9 +687,7 @@ pub fn decode_wal_record(
|
||||
* cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED
|
||||
* flag is set.
|
||||
*/
|
||||
if (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0)
|
||||
&& blk.bimg_len == BLCKSZ
|
||||
{
|
||||
if !blk_img_is_compressed && blk.bimg_len == BLCKSZ {
|
||||
// TODO
|
||||
/*
|
||||
report_invalid_record(state,
|
||||
@@ -683,7 +703,7 @@ pub fn decode_wal_record(
|
||||
* IS_COMPRESSED flag is set.
|
||||
*/
|
||||
if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0
|
||||
&& blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED == 0
|
||||
&& !blk_img_is_compressed
|
||||
&& blk.bimg_len != BLCKSZ
|
||||
{
|
||||
// TODO
|
||||
|
||||
@@ -21,7 +21,6 @@
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use nix::poll::*;
|
||||
use remote_storage::path_with_suffix_extension;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
@@ -36,6 +35,7 @@ use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::crashsafe_dir::path_with_suffix_extension;
|
||||
use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};
|
||||
|
||||
use crate::metrics::{
|
||||
@@ -46,11 +46,12 @@ use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{config::PageServerConf, TEMP_FILE_SUFFIX};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
use postgres_ffi::v14::nonrelfile_utils::{
|
||||
mx_offset_to_flags_bitshift, mx_offset_to_flags_offset, mx_offset_to_member_offset,
|
||||
transaction_id_set_status,
|
||||
};
|
||||
use postgres_ffi::v14::pg_constants;
|
||||
use postgres_ffi::BLCKSZ;
|
||||
|
||||
///
|
||||
@@ -82,6 +83,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
|
||||
|
||||
@@ -144,6 +146,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
if records.is_empty() {
|
||||
error!("invalid WAL redo request with no records");
|
||||
@@ -166,6 +169,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
img,
|
||||
&records[batch_start..i],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
};
|
||||
img = Some(result?);
|
||||
@@ -184,6 +188,7 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
img,
|
||||
&records[batch_start..],
|
||||
self.conf.wal_redo_timeout,
|
||||
pg_version,
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -212,6 +217,7 @@ impl PostgresRedoManager {
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
|
||||
@@ -222,7 +228,7 @@ impl PostgresRedoManager {
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id)?;
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
@@ -326,7 +332,7 @@ impl PostgresRedoManager {
|
||||
// sanity check that this is modifying the correct relation
|
||||
let (rel, blknum) = key_to_rel_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert!(
|
||||
rel.forknum == pg_constants::VISIBILITYMAP_FORKNUM,
|
||||
rel.forknum == VISIBILITYMAP_FORKNUM,
|
||||
"ClearVisibilityMapFlags record on unexpected rel {}",
|
||||
rel
|
||||
);
|
||||
@@ -570,7 +576,11 @@ impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
fn launch(conf: &PageServerConf, tenant_id: &TenantId) -> Result<PostgresRedoProcess, Error> {
|
||||
fn launch(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
pg_version: u32,
|
||||
) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
@@ -588,12 +598,12 @@ impl PostgresRedoProcess {
|
||||
fs::remove_dir_all(&datadir)?;
|
||||
}
|
||||
info!("running initdb in {}", datadir.display());
|
||||
let initdb = Command::new(conf.pg_bin_dir().join("initdb"))
|
||||
let initdb = Command::new(conf.pg_bin_dir(pg_version).join("initdb"))
|
||||
.args(&["-D", &datadir.to_string_lossy()])
|
||||
.arg("-N")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.close_fds()
|
||||
.output()
|
||||
.map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {e}")))?;
|
||||
@@ -619,14 +629,14 @@ impl PostgresRedoProcess {
|
||||
}
|
||||
|
||||
// Start postgres itself
|
||||
let mut child = Command::new(conf.pg_bin_dir().join("postgres"))
|
||||
let mut child = Command::new(conf.pg_bin_dir(pg_version).join("postgres"))
|
||||
.arg("--wal-redo")
|
||||
.stdin(Stdio::piped())
|
||||
.stderr(Stdio::piped())
|
||||
.stdout(Stdio::piped())
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir(pg_version))
|
||||
.env("PGDATA", &datadir)
|
||||
// The redo process is not trusted, so it runs in seccomp mode
|
||||
// (see seccomp in zenith_wal_redo.c). We have to make sure it doesn't
|
||||
|
||||
@@ -183,7 +183,7 @@ pageserver_send(NeonRequest * request)
|
||||
if (!connected)
|
||||
pageserver_connect();
|
||||
|
||||
req_buff = zm_pack_request(request);
|
||||
req_buff = nm_pack_request(request);
|
||||
|
||||
/*
|
||||
* Send request.
|
||||
@@ -204,7 +204,7 @@ pageserver_send(NeonRequest * request)
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = zm_to_string((NeonMessage *) request);
|
||||
char *msg = nm_to_string((NeonMessage *) request);
|
||||
|
||||
neon_log(PageStoreTrace, "sent request: %s", msg);
|
||||
pfree(msg);
|
||||
@@ -230,12 +230,12 @@ pageserver_receive(void)
|
||||
else if (resp_buff.len == -2)
|
||||
neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
|
||||
}
|
||||
resp = zm_unpack_response(&resp_buff);
|
||||
resp = nm_unpack_response(&resp_buff);
|
||||
PQfreemem(resp_buff.data);
|
||||
|
||||
if (message_level_is_interesting(PageStoreTrace))
|
||||
{
|
||||
char *msg = zm_to_string((NeonMessage *) resp);
|
||||
char *msg = nm_to_string((NeonMessage *) resp);
|
||||
|
||||
neon_log(PageStoreTrace, "got response: %s", msg);
|
||||
pfree(msg);
|
||||
@@ -341,9 +341,9 @@ page_server_api api = {
|
||||
static bool
|
||||
check_neon_id(char **newval, void **extra, GucSource source)
|
||||
{
|
||||
uint8 zid[16];
|
||||
uint8 id[16];
|
||||
|
||||
return **newval == '\0' || HexDecodeString(zid, *newval, 16);
|
||||
return **newval == '\0' || HexDecodeString(id, *newval, 16);
|
||||
}
|
||||
|
||||
static char *
|
||||
|
||||
@@ -128,9 +128,9 @@ typedef struct
|
||||
* message */
|
||||
} NeonErrorResponse;
|
||||
|
||||
extern StringInfoData zm_pack_request(NeonRequest * msg);
|
||||
extern NeonResponse * zm_unpack_response(StringInfo s);
|
||||
extern char *zm_to_string(NeonMessage * msg);
|
||||
extern StringInfoData nm_pack_request(NeonRequest * msg);
|
||||
extern NeonResponse * nm_unpack_response(StringInfo s);
|
||||
extern char *nm_to_string(NeonMessage * msg);
|
||||
|
||||
/*
|
||||
* API
|
||||
|
||||
@@ -160,7 +160,7 @@ page_server_request(void const *req)
|
||||
|
||||
|
||||
StringInfoData
|
||||
zm_pack_request(NeonRequest * msg)
|
||||
nm_pack_request(NeonRequest * msg)
|
||||
{
|
||||
StringInfoData s;
|
||||
|
||||
@@ -235,7 +235,7 @@ zm_pack_request(NeonRequest * msg)
|
||||
}
|
||||
|
||||
NeonResponse *
|
||||
zm_unpack_response(StringInfo s)
|
||||
nm_unpack_response(StringInfo s)
|
||||
{
|
||||
NeonMessageTag tag = pq_getmsgbyte(s);
|
||||
NeonResponse *resp = NULL;
|
||||
@@ -329,7 +329,7 @@ zm_unpack_response(StringInfo s)
|
||||
|
||||
/* dump to json for debugging / error reporting purposes */
|
||||
char *
|
||||
zm_to_string(NeonMessage * msg)
|
||||
nm_to_string(NeonMessage * msg)
|
||||
{
|
||||
StringInfoData s;
|
||||
|
||||
@@ -632,7 +632,7 @@ neon_init(void)
|
||||
* It may cause problems with XLogFlush. So return pointer backward to the origin of the page.
|
||||
*/
|
||||
static XLogRecPtr
|
||||
zm_adjust_lsn(XLogRecPtr lsn)
|
||||
nm_adjust_lsn(XLogRecPtr lsn)
|
||||
{
|
||||
/*
|
||||
* If lsn points to the beginning of the first record on a page or segment, then
|
||||
@@ -685,7 +685,7 @@ neon_get_request_lsn(bool *latest, RelFileNode rnode, ForkNumber forknum, BlockN
|
||||
elog(DEBUG1, "neon_get_request_lsn GetLastWrittenLSN lsn %X/%X ",
|
||||
(uint32) ((lsn) >> 32), (uint32) (lsn));
|
||||
|
||||
lsn = zm_adjust_lsn(lsn);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
/*
|
||||
* Is it possible that the last-written LSN is ahead of last flush
|
||||
@@ -959,7 +959,17 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
|
||||
if (IS_LOCAL_REL(reln))
|
||||
mdextend(reln, forkNum, blkno, buffer, skipFsync);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* smgr_extend is often called with an all-zeroes page, so lsn==InvalidXLogRecPtr.
|
||||
* An smgr_write() call will come for the buffer later, after it has been initialized
|
||||
* with the real page contents, and it is eventually evicted from the buffer cache.
|
||||
* But we need a valid LSN to the relation metadata update now.
|
||||
*/
|
||||
if (lsn == InvalidXLogRecPtr)
|
||||
{
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
SetLastWrittenLSNForBlock(lsn, reln->smgr_rnode.node, forkNum, blkno);
|
||||
}
|
||||
SetLastWrittenLSNForRelation(lsn, reln->smgr_rnode.node, forkNum);
|
||||
}
|
||||
|
||||
@@ -1559,7 +1569,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
|
||||
*/
|
||||
lsn = GetXLogInsertRecPtr();
|
||||
|
||||
lsn = zm_adjust_lsn(lsn);
|
||||
lsn = nm_adjust_lsn(lsn);
|
||||
|
||||
/*
|
||||
* Flush it, too. We don't actually care about it here, but let's uphold
|
||||
|
||||
@@ -11,13 +11,14 @@ bstr = "0.2.17"
|
||||
bytes = { version = "1.0.1", features = ['serde'] }
|
||||
clap = "3.0"
|
||||
futures = "0.3.13"
|
||||
git-version = "0.3.5"
|
||||
hashbrown = "0.12"
|
||||
hex = "0.4.3"
|
||||
hmac = "0.12.1"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
once_cell = "1.13.0"
|
||||
md5 = "0.7.0"
|
||||
once_cell = "1.13.0"
|
||||
parking_lot = "0.12"
|
||||
pin-project-lite = "0.2.7"
|
||||
rand = "0.8.3"
|
||||
@@ -35,14 +36,13 @@ tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-rustls = "0.23.0"
|
||||
url = "2.2.2"
|
||||
git-version = "0.3.5"
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"]}
|
||||
x509-parser = "0.13.2"
|
||||
|
||||
utils = { path = "../libs/utils" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
x509-parser = "0.13.2"
|
||||
|
||||
[dev-dependencies]
|
||||
rcgen = "0.8.14"
|
||||
rstest = "0.12"