mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-21 20:32:56 +00:00
Compare commits
7 Commits
parallel_w
...
temp-flaky
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8a44796905 | ||
|
|
ed521e05e7 | ||
|
|
6a13500da4 | ||
|
|
9b8168ebde | ||
|
|
f9bb4dbf08 | ||
|
|
20ee204c27 | ||
|
|
3fdd85bcb8 |
@@ -1,19 +1,20 @@
|
||||
version: 2.1
|
||||
|
||||
orbs:
|
||||
python: circleci/python@1.4.0
|
||||
|
||||
executors:
|
||||
zenith-build-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
- image: cimg/rust:1.56.1
|
||||
zenith-python-executor:
|
||||
docker:
|
||||
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI
|
||||
- image: cimg/rust:1.52.1
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
check-codestyle:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
- checkout
|
||||
|
||||
- run:
|
||||
name: rustfmt
|
||||
when: always
|
||||
@@ -23,12 +24,6 @@ jobs:
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: zenith-build-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
|
||||
- checkout
|
||||
@@ -44,7 +39,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
# FIXME We could cache our own docker container, instead of installing packages every time.
|
||||
- run:
|
||||
@@ -64,12 +59,12 @@ jobs:
|
||||
if [ ! -e tmp_install/bin/postgres ]; then
|
||||
# "depth 1" saves some time by not cloning the whole repo
|
||||
git submodule update --init --depth 1
|
||||
make postgres -j8
|
||||
make postgres
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save postgres cache
|
||||
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
key: v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
@@ -80,8 +75,6 @@ jobs:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
- run:
|
||||
name: apt install dependencies
|
||||
@@ -103,7 +96,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v03-postgres-cache-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
@@ -111,26 +104,25 @@ jobs:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
- v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
|
||||
# Build the rust code, including test binaries
|
||||
- run:
|
||||
name: Rust build << parameters.build_type >>
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
"${cov_prefix[@]}" cargo build $CARGO_FLAGS --bins --tests
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
echo "Build in debug mode"
|
||||
cargo build --bins --tests
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
echo "Build in release mode"
|
||||
cargo build --release --bins --tests
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
key: v03-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
paths:
|
||||
- ~/.cargo/registry
|
||||
- ~/.cargo/git
|
||||
@@ -140,100 +132,53 @@ jobs:
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: cargo clippy
|
||||
name: clippy
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run:
|
||||
name: cargo test
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" cargo test
|
||||
- run: cargo test
|
||||
|
||||
# Install the rust binaries, for use by test jobs
|
||||
# `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
|
||||
# FIXME: this is a really silly way to install; maybe we should just output
|
||||
# a tarball as an artifact? Or a .deb package?
|
||||
- run:
|
||||
name: Install rust binaries
|
||||
name: cargo install
|
||||
command: |
|
||||
export CARGO_INCREMENTAL=0
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
echo "Install debug mode"
|
||||
CARGO_FLAGS="--debug"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
binaries=$(
|
||||
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
|
||||
test_exe_paths=$(
|
||||
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
|
||||
mkdir -p /tmp/zenith/bin
|
||||
mkdir -p /tmp/zenith/test_bin
|
||||
mkdir -p /tmp/zenith/etc
|
||||
|
||||
# Install target binaries
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/zenith/bin/$bin
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
|
||||
# Install test executables (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/zenith/test_bin/$(basename $bin)
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
echo "Install release mode"
|
||||
# The default is release mode; there is no --release flag.
|
||||
CARGO_FLAGS=""
|
||||
fi
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
|
||||
|
||||
# Install the postgres binaries, for use by test jobs
|
||||
# FIXME: this is a silly way to do "install"; maybe just output a standard
|
||||
# postgres package, whatever the favored form is (tarball? .deb package?)
|
||||
# Note that pg_regress needs some build artifacts that probably aren't
|
||||
# in the usual package...?
|
||||
- run:
|
||||
name: Install postgres binaries
|
||||
name: postgres install
|
||||
command: |
|
||||
cp -a tmp_install /tmp/zenith/pg_install
|
||||
|
||||
# Save the rust binaries and coverage data for other jobs in this workflow.
|
||||
# Save the rust output binaries for other jobs in this workflow.
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
check-codestyle-python:
|
||||
executor: zenith-python-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install deps
|
||||
command: pipenv --python 3.7 install --dev
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
when: always
|
||||
command: pipenv run yapf --recursive --diff .
|
||||
- run:
|
||||
name: Run mypy to check types
|
||||
when: always
|
||||
command: pipenv run mypy .
|
||||
|
||||
run-pytest:
|
||||
executor: zenith-python-executor
|
||||
#description: "Run pytest"
|
||||
executor: python/default
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
@@ -259,11 +204,6 @@ jobs:
|
||||
run_in_parallel:
|
||||
type: boolean
|
||||
default: true
|
||||
save_perf_report:
|
||||
type: boolean
|
||||
default: false
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -273,74 +213,39 @@ jobs:
|
||||
steps:
|
||||
- run: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Install deps
|
||||
command: pipenv --python 3.7 install
|
||||
name: Install pipenv & deps
|
||||
working_directory: test_runner
|
||||
command: |
|
||||
pip install pipenv
|
||||
pipenv install
|
||||
- run:
|
||||
name: Run pytest
|
||||
# pytest doesn't output test logs in real time, so CI job may fail with
|
||||
# `Too long with no output` error, if a test is running for a long time.
|
||||
# In that case, tests should have internal timeouts that are less than
|
||||
# no_output_timeout, specified here.
|
||||
no_output_timeout: 10m
|
||||
working_directory: test_runner
|
||||
environment:
|
||||
- ZENITH_BIN: /tmp/zenith/bin
|
||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||
- TEST_OUTPUT: /tmp/test_output
|
||||
# this variable will be embedded in perf test report
|
||||
# and is needed to distinguish different environments
|
||||
- PLATFORM: zenith-local-ci
|
||||
command: |
|
||||
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
|
||||
|
||||
TEST_SELECTION="test_runner/<< parameters.test_selection >>"
|
||||
TEST_SELECTION="<< parameters.test_selection >>"
|
||||
EXTRA_PARAMS="<< parameters.extra_params >>"
|
||||
if [ -z "$TEST_SELECTION" ]; then
|
||||
echo "test_selection must be set"
|
||||
exit 1
|
||||
fi
|
||||
if << parameters.run_in_parallel >>; then
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
fi
|
||||
if << parameters.save_perf_report >>; then
|
||||
if [[ $CIRCLE_BRANCH == "main" ]]; then
|
||||
mkdir -p "$PERF_REPORT_DIR"
|
||||
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
|
||||
fi
|
||||
fi
|
||||
|
||||
export GITHUB_SHA=$CIRCLE_SHA1
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
EXTRA_PARAMS="-n16 $EXTRA_PARAMS"
|
||||
fi;
|
||||
# Run the tests.
|
||||
#
|
||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||
# in its "Tests" tab in the results page.
|
||||
# -s prevents pytest from capturing output, which helps to see
|
||||
# what's going on if the test hangs
|
||||
# --verbose prints name of each test (helpful when there are
|
||||
# multiple tests in one file)
|
||||
# -rA prints summary in the end
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||
# in parallel and logs are mixed between different tests
|
||||
"${cov_prefix[@]}" pipenv run pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "not remote_cluster" \
|
||||
-rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
|
||||
if << parameters.save_perf_report >>; then
|
||||
if [[ $CIRCLE_BRANCH == "main" ]]; then
|
||||
# TODO: reuse scripts/git-upload
|
||||
export REPORT_FROM="$PERF_REPORT_DIR"
|
||||
export REPORT_TO=local
|
||||
scripts/generate_and_push_perf_report.sh
|
||||
fi
|
||||
fi
|
||||
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
- run:
|
||||
# CircleCI artifacts are preserved one file at a time, so skipping
|
||||
# this step isn't a good idea. If you want to extract the
|
||||
@@ -349,72 +254,13 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "wal_acceptor.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
||||
- store_test_results:
|
||||
path: /tmp/test_output
|
||||
# Save coverage data (if any)
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
- checkout
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
keys:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
|
||||
- run:
|
||||
name: Install llvm-tools
|
||||
command: |
|
||||
# TODO: install a proper symbol demangler, e.g. rustfilt
|
||||
# TODO: we should embed this into a docker image
|
||||
rustup component add llvm-tools-preview
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
--input-objects=/tmp/zenith/etc/binaries.list \
|
||||
--commit-url=$COMMIT_URL \
|
||||
--format=github
|
||||
- run:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
# Add link to the coverage report to the commit
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"success\",
|
||||
\"context\": \"zenith-coverage\",
|
||||
\"description\": \"Coverage report is ready\",
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
@@ -431,7 +277,7 @@ jobs:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
@@ -473,27 +319,23 @@ jobs:
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"zenith-remote-ci\",
|
||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
||||
\"remote_repo\": \"$LOCAL_REPO\"
|
||||
\"remote_repo\": \"$LOCAL_REPO\",
|
||||
\"zenith_image_branch\": \"$CIRCLE_BRANCH\"
|
||||
}
|
||||
}"
|
||||
|
||||
workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- check-codestyle-rust
|
||||
- check-codestyle-python
|
||||
- build-postgres:
|
||||
name: build-postgres-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- check-codestyle
|
||||
- build-postgres
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
requires:
|
||||
- build-postgres-<< matrix.build_type >>
|
||||
- build-postgres
|
||||
- run-pytest:
|
||||
name: pg_regress-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -516,15 +358,8 @@ workflows:
|
||||
build_type: release
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-zenith-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
requires:
|
||||
# TODO: consider adding more
|
||||
- other-tests-debug
|
||||
- docker-image:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
|
||||
@@ -11,8 +11,4 @@ test_output
|
||||
.vscode
|
||||
.zenith
|
||||
integration_tests/.zenith
|
||||
.mypy_cache
|
||||
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
|
||||
.mypy_cache
|
||||
114
.github/workflows/benchmarking.yml
vendored
114
.github/workflows/benchmarking.yml
vendored
@@ -1,114 +0,0 @@
|
||||
name: benchmarking
|
||||
|
||||
on:
|
||||
# uncomment to run on push for debugging your PR
|
||||
# push:
|
||||
# branches: [ mybranch ]
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '36 7 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
env:
|
||||
BASE_URL: "https://console.zenith.tech"
|
||||
|
||||
jobs:
|
||||
bench:
|
||||
# this workflow runs on self hosteed runner
|
||||
# it's environment is quite different from usual guthub runner
|
||||
# probably the most important difference is that it doesnt start from clean workspace each time
|
||||
# e g if you install system packages they are not cleaned up since you install them directly in host machine
|
||||
# not a container or something
|
||||
# See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
PG_BIN: "/usr/pgsql-13/bin"
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# actions/setup-python@v2 is not working correctly on self-hosted runners
|
||||
# see https://github.com/actions/setup-python/issues/162
|
||||
# and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
|
||||
# so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
|
||||
# there is Python 3.7.10 already installed on the machine so use it to install pipenv and then use pipenv's virtuealenvs
|
||||
- name: Install pipenv & deps
|
||||
run: |
|
||||
python3 -m pip install --upgrade pipenv wheel
|
||||
# since pip/pipenv caches are reused there shouldn't be any troubles with install every time
|
||||
pipenv install
|
||||
|
||||
- name: Show versions
|
||||
run: |
|
||||
echo Python
|
||||
python3 --version
|
||||
pipenv run python3 --version
|
||||
echo Pipenv
|
||||
pipenv --version
|
||||
echo Pgbench
|
||||
$PG_BIN/pgbench --version
|
||||
|
||||
# FIXME cluster setup is skipped due to various changes in console API
|
||||
# for now pre created cluster is used. When API gain some stability
|
||||
# after massive changes dynamic cluster setup will be revived.
|
||||
# So use pre created cluster. It needs to be started manually, but stop is automatic after 5 minutes of inactivity
|
||||
- name: Setup cluster
|
||||
env:
|
||||
BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}"
|
||||
BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
|
||||
BENCHMARK_CLUSTER_ID: "${{ secrets.BENCHMARK_CLUSTER_ID }}"
|
||||
shell: bash
|
||||
run: |
|
||||
set -e
|
||||
|
||||
echo "Starting cluster"
|
||||
CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID/start \
|
||||
-H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
|
||||
echo $CLUSTER | python -m json.tool
|
||||
|
||||
echo "Waiting for cluster to become ready"
|
||||
sleep 10
|
||||
|
||||
echo "CLUSTER_ID=$BENCHMARK_CLUSTER_ID" >> $GITHUB_ENV
|
||||
CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID.json \
|
||||
-H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
|
||||
echo $CLUSTER | python -m json.tool
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
# https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# via
|
||||
# sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
|
||||
# [pgdg13]
|
||||
# name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
|
||||
# baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# enabled=1
|
||||
# gpgcheck=0
|
||||
# EOF
|
||||
# sudo yum makecache
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
env:
|
||||
TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,15"
|
||||
PLATFORM: "zenith-staging"
|
||||
BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
run: |
|
||||
mkdir -p perf-report-staging
|
||||
pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
run: |
|
||||
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -7,7 +7,3 @@ test_output/
|
||||
.vscode
|
||||
/.zenith
|
||||
/integration_tests/.zenith
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
|
||||
|
||||
10
.yapfignore
10
.yapfignore
@@ -1,10 +0,0 @@
|
||||
# This file is only read when `yapf` is run from this directory.
|
||||
# Hence we only top-level directories here to avoid confusion.
|
||||
# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
|
||||
vendor/
|
||||
target/
|
||||
tmp_install/
|
||||
__pycache__/
|
||||
test_output/
|
||||
.zenith/
|
||||
.git/
|
||||
683
Cargo.lock
generated
683
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -10,7 +10,6 @@ FROM zenithdb/build:buster AS pg-build
|
||||
WORKDIR /zenith
|
||||
COPY ./vendor/postgres vendor/postgres
|
||||
COPY ./Makefile Makefile
|
||||
ENV BUILD_TYPE release
|
||||
RUN make -j $(getconf _NPROCESSORS_ONLN) -s postgres
|
||||
RUN rm -rf postgres_install/build
|
||||
|
||||
@@ -21,15 +20,11 @@ RUN rm -rf postgres_install/build
|
||||
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
|
||||
#
|
||||
FROM zenithdb/build:buster AS build
|
||||
|
||||
ARG GIT_VERSION
|
||||
RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
|
||||
|
||||
WORKDIR /zenith
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
|
||||
COPY . .
|
||||
RUN GIT_VERSION=$GIT_VERSION cargo build --release
|
||||
RUN cargo build --release
|
||||
|
||||
#
|
||||
# Copy binaries to resulting image.
|
||||
@@ -41,7 +36,7 @@ RUN apt-get update && apt-get -yq install libreadline-dev libseccomp-dev openssl
|
||||
mkdir zenith_install
|
||||
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/proxy /usr/local/bin
|
||||
COPY --from=pg-build /zenith/tmp_install postgres_install
|
||||
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||
|
||||
@@ -81,7 +81,7 @@ FROM alpine:3.13
|
||||
RUN apk add --update openssl build-base libseccomp-dev
|
||||
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
|
||||
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/wal_acceptor /usr/local/bin
|
||||
COPY --from=build /zenith/target/release/proxy /usr/local/bin
|
||||
COPY --from=pg-build /zenith/tmp_install /usr/local
|
||||
COPY docker-entrypoint.sh /docker-entrypoint.sh
|
||||
|
||||
58
Makefile
58
Makefile
@@ -6,55 +6,34 @@ else
|
||||
SECCOMP =
|
||||
endif
|
||||
|
||||
#
|
||||
# We differentiate between release / debug build types using the BUILD_TYPE
|
||||
# environment variable.
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
else
|
||||
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# Choose whether we should be silent or verbose
|
||||
CARGO_BUILD_FLAGS += --$(if $(filter s,$(MAKEFLAGS)),quiet,verbose)
|
||||
# Fix for a corner case when make doesn't pass a jobserver
|
||||
CARGO_BUILD_FLAGS += $(filter -j1,$(MAKEFLAGS))
|
||||
|
||||
# This option has a side effect of passing make jobserver to cargo.
|
||||
# However, we shouldn't do this if `make -n` (--dry-run) has been asked.
|
||||
CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
|
||||
# Force cargo not to print progress bar
|
||||
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
|
||||
|
||||
#
|
||||
# Top level Makefile to build Zenith and PostgreSQL
|
||||
#
|
||||
.PHONY: all
|
||||
all: zenith postgres
|
||||
|
||||
# We don't want to run 'cargo build' in parallel with the postgres build,
|
||||
# because interleaving cargo build output with postgres build output looks
|
||||
# confusing. Also, 'cargo build' is parallel on its own, so it would be too
|
||||
# much parallelism. (Recursive invocation of postgres target still gets any
|
||||
# '-j' flag from the command line, so 'make -j' is still useful.)
|
||||
.NOTPARALLEL:
|
||||
|
||||
### Zenith Rust bits
|
||||
#
|
||||
# The 'postgres_ffi' depends on the Postgres headers.
|
||||
.PHONY: zenith
|
||||
zenith: postgres-headers
|
||||
+@echo "Compiling Zenith"
|
||||
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
|
||||
cargo build
|
||||
|
||||
### PostgreSQL parts
|
||||
tmp_install/build/config.status:
|
||||
+@echo "Configuring postgres build"
|
||||
mkdir -p tmp_install/build
|
||||
(cd tmp_install/build && \
|
||||
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
|
||||
$(PG_CONFIGURE_OPTS) \
|
||||
../../vendor/postgres/configure CFLAGS='-O0 -g3 $(CFLAGS)' \
|
||||
--enable-cassert \
|
||||
--enable-debug \
|
||||
--enable-depend \
|
||||
$(SECCOMP) \
|
||||
--prefix=$(abspath tmp_install) > configure.log)
|
||||
|
||||
@@ -68,10 +47,10 @@ postgres-headers: postgres-configure
|
||||
+@echo "Installing PostgreSQL headers"
|
||||
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
|
||||
|
||||
|
||||
# Compile and install PostgreSQL and contrib/zenith
|
||||
.PHONY: postgres
|
||||
postgres: postgres-configure \
|
||||
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
|
||||
postgres: postgres-configure
|
||||
+@echo "Compiling PostgreSQL"
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
|
||||
+@echo "Compiling contrib/zenith"
|
||||
@@ -79,21 +58,18 @@ postgres: postgres-configure \
|
||||
+@echo "Compiling contrib/zenith_test_utils"
|
||||
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
|
||||
|
||||
.PHONY: postgres-clean
|
||||
postgres-clean:
|
||||
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
|
||||
|
||||
# This doesn't remove the effects of 'configure'.
|
||||
.PHONY: clean
|
||||
clean:
|
||||
cd tmp_install/build && $(MAKE) clean
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
cd tmp_install/build && ${MAKE} clean
|
||||
cargo clean
|
||||
|
||||
# This removes everything
|
||||
.PHONY: distclean
|
||||
distclean:
|
||||
rm -rf tmp_install
|
||||
$(CARGO_CMD_PREFIX) cargo clean
|
||||
cargo clean
|
||||
|
||||
.PHONY: fmt
|
||||
fmt:
|
||||
|
||||
30
Pipfile
30
Pipfile
@@ -1,30 +0,0 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
pytest = ">=6.0.0"
|
||||
typing-extensions = "*"
|
||||
pyjwt = {extras = ["crypto"], version = "*"}
|
||||
requests = "*"
|
||||
pytest-xdist = "*"
|
||||
asyncpg = "*"
|
||||
cached-property = "*"
|
||||
psycopg2-binary = "*"
|
||||
jinja2 = "*"
|
||||
|
||||
[dev-packages]
|
||||
# Behavior may change slightly between versions. These are run continuously,
|
||||
# so we pin exact versions to avoid suprising breaks. Update if comfortable.
|
||||
yapf = "==0.31.0"
|
||||
mypy = "==0.910"
|
||||
# Non-pinned packages follow.
|
||||
pipenv = "*"
|
||||
flake8 = "*"
|
||||
types-requests = "*"
|
||||
types-psycopg2 = "*"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.7, but pipenv doesn't allow to say this directly
|
||||
python_version = "3"
|
||||
652
Pipfile.lock
generated
652
Pipfile.lock
generated
@@ -1,652 +0,0 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"cached-property": {
|
||||
"hashes": [
|
||||
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
|
||||
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.5.2"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
|
||||
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
|
||||
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
|
||||
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
|
||||
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
|
||||
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
|
||||
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
|
||||
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
|
||||
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
|
||||
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
|
||||
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
|
||||
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
|
||||
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
|
||||
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
|
||||
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
|
||||
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
|
||||
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
|
||||
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
|
||||
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
|
||||
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
|
||||
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
|
||||
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
|
||||
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
|
||||
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
|
||||
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
|
||||
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
|
||||
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
|
||||
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
|
||||
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
|
||||
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
|
||||
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
|
||||
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
|
||||
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
|
||||
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
|
||||
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
|
||||
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
|
||||
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
|
||||
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
|
||||
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
|
||||
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
|
||||
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
|
||||
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
|
||||
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
|
||||
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
|
||||
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
|
||||
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
|
||||
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
|
||||
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
|
||||
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
|
||||
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
|
||||
],
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
|
||||
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
|
||||
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
|
||||
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
|
||||
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
|
||||
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
|
||||
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
|
||||
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
|
||||
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
|
||||
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
|
||||
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
|
||||
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
|
||||
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
|
||||
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
|
||||
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
|
||||
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
|
||||
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
|
||||
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
|
||||
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
|
||||
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
|
||||
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
|
||||
],
|
||||
"version": "==35.0.0"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
|
||||
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.0.2"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
|
||||
"sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
|
||||
"sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
|
||||
"sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
|
||||
"sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
|
||||
"sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
|
||||
"sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
|
||||
"sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
|
||||
"sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
|
||||
"sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
|
||||
"sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
|
||||
"sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
|
||||
"sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
|
||||
"sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
|
||||
"sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
|
||||
"sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
|
||||
"sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
|
||||
"sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
|
||||
"sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
|
||||
"sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
|
||||
"sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
|
||||
"sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
|
||||
"sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
|
||||
"sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
|
||||
"sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
|
||||
"sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
|
||||
"sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
|
||||
"sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
|
||||
"sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
|
||||
"sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
|
||||
"sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
|
||||
"sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
|
||||
"sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
|
||||
"sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
|
||||
"sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
|
||||
"sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
|
||||
"sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
|
||||
"sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
|
||||
"sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
|
||||
"sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
|
||||
"sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
|
||||
"sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
|
||||
"sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
|
||||
"sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
|
||||
"sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
|
||||
"sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
|
||||
"sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
|
||||
"sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
|
||||
"sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
|
||||
"sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
|
||||
"sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
|
||||
"sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
|
||||
"sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
|
||||
"sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
|
||||
"sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
|
||||
"sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
|
||||
"sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
|
||||
"sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
|
||||
"sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
|
||||
"sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
|
||||
"sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
|
||||
"sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
|
||||
"sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
|
||||
"sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
|
||||
"sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
|
||||
"sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
|
||||
"sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
|
||||
"sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
|
||||
"sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
|
||||
"sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.2"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2-binary": {
|
||||
"hashes": [
|
||||
"sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
|
||||
"sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
|
||||
"sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
|
||||
"sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
|
||||
"sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
|
||||
"sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
|
||||
"sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
|
||||
"sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
|
||||
"sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
|
||||
"sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
|
||||
"sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
|
||||
"sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
|
||||
"sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
|
||||
"sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
|
||||
"sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
|
||||
"sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
|
||||
"sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
|
||||
"sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
|
||||
"sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
|
||||
"sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
|
||||
"sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
|
||||
"sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
|
||||
"sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
|
||||
"sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
|
||||
"sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
|
||||
"sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
|
||||
"sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
|
||||
"sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
|
||||
"sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
|
||||
"sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
|
||||
"sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
|
||||
"sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
|
||||
"sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
|
||||
"sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"pyjwt": {
|
||||
"extras": [
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
|
||||
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
|
||||
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.5"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
|
||||
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
|
||||
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
|
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.26.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
|
||||
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.7"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"backports.entry-points-selectable": {
|
||||
"hashes": [
|
||||
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
|
||||
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
|
||||
],
|
||||
"markers": "python_version >= '2.7'",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"distlib": {
|
||||
"hashes": [
|
||||
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
|
||||
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
|
||||
],
|
||||
"version": "==0.3.3"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
|
||||
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.3.2"
|
||||
},
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
|
||||
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.0.1"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
|
||||
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
|
||||
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
|
||||
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
|
||||
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
|
||||
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
|
||||
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
|
||||
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
|
||||
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
|
||||
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
|
||||
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
|
||||
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
|
||||
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
|
||||
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
|
||||
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
|
||||
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
|
||||
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
|
||||
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
|
||||
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
|
||||
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
|
||||
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
|
||||
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
|
||||
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.910"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pipenv": {
|
||||
"hashes": [
|
||||
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
|
||||
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2021.5.29"
|
||||
},
|
||||
"platformdirs": {
|
||||
"hashes": [
|
||||
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
|
||||
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
|
||||
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==2.8.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
|
||||
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typed-ast": {
|
||||
"hashes": [
|
||||
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
|
||||
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
|
||||
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
|
||||
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
|
||||
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
|
||||
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
|
||||
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
|
||||
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
|
||||
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
|
||||
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
|
||||
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
|
||||
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
|
||||
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
|
||||
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
|
||||
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
|
||||
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
|
||||
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
|
||||
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
|
||||
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
|
||||
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
|
||||
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
|
||||
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
|
||||
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
|
||||
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
|
||||
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
|
||||
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
|
||||
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
|
||||
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
|
||||
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
|
||||
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==1.4.3"
|
||||
},
|
||||
"types-psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
|
||||
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"types-requests": {
|
||||
"hashes": [
|
||||
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
|
||||
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.25.11"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"virtualenv": {
|
||||
"hashes": [
|
||||
"sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
|
||||
"sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==20.10.0"
|
||||
},
|
||||
"virtualenv-clone": {
|
||||
"hashes": [
|
||||
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
|
||||
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.5.7"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
1
Pipfile.lock
generated
Symbolic link
1
Pipfile.lock
generated
Symbolic link
@@ -0,0 +1 @@
|
||||
./test_runner/Pipfile.lock
|
||||
33
README.md
33
README.md
@@ -25,15 +25,15 @@ Pageserver consists of:
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
```text
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
libssl-dev clang
|
||||
```
|
||||
|
||||
[Rust] 1.55 or later is also required.
|
||||
[Rust] 1.52 or later is also required.
|
||||
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.
|
||||
To run the integration tests (not required to use the code), install
|
||||
Python (3.6 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
|
||||
|
||||
2. Build zenith and patched postgres
|
||||
```sh
|
||||
@@ -47,26 +47,17 @@ make -j5
|
||||
# Create repository in .zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/zenith init
|
||||
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
|
||||
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
|
||||
created main branch
|
||||
pageserver init succeeded
|
||||
|
||||
# start pageserver and safekeeper
|
||||
# start pageserver
|
||||
> ./target/debug/zenith start
|
||||
Starting pageserver at 'localhost:64000' in '.zenith'
|
||||
Starting pageserver at '127.0.0.1:64000' in .zenith
|
||||
Pageserver started
|
||||
initializing for single for 7676
|
||||
Starting safekeeper at 'localhost:5454' in '.zenith/safekeepers/single'
|
||||
Safekeeper started
|
||||
|
||||
# start postgres compute node
|
||||
# start postgres on top on the pageserver
|
||||
> ./target/debug/zenith pg start main
|
||||
Starting new postgres main on main...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=stas'
|
||||
waiting for server to start.... done
|
||||
server started
|
||||
|
||||
# check list of running postgres instances
|
||||
> ./target/debug/zenith pg list
|
||||
@@ -117,19 +108,13 @@ postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
```
|
||||
|
||||
6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
you have just started. You can stop them all with one command:
|
||||
```sh
|
||||
> ./target/debug/zenith stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
cd test_runner
|
||||
pipenv run pytest
|
||||
pytest
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
@@ -18,7 +18,7 @@ regex = "1"
|
||||
anyhow = "1.0"
|
||||
thiserror = "1"
|
||||
bytes = "1.0.1"
|
||||
nix = "0.23"
|
||||
nix = "0.20"
|
||||
url = "2.2.2"
|
||||
hex = { version = "0.4.3", features = ["serde"] }
|
||||
reqwest = { version = "0.11", features = ["blocking", "json"] }
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
# Page server and three safekeepers.
|
||||
[pageserver]
|
||||
pg_port = 64000
|
||||
http_port = 9898
|
||||
auth_type = 'Trust'
|
||||
|
||||
[[safekeepers]]
|
||||
name = 'sk1'
|
||||
pg_port = 5454
|
||||
http_port = 7676
|
||||
|
||||
[[safekeepers]]
|
||||
name = 'sk2'
|
||||
pg_port = 5455
|
||||
http_port = 7677
|
||||
|
||||
[[safekeepers]]
|
||||
name = 'sk3'
|
||||
pg_port = 5456
|
||||
http_port = 7678
|
||||
@@ -1,11 +0,0 @@
|
||||
# Minimal zenith environment with one safekeeper. This is equivalent to the built-in
|
||||
# defaults that you get with no --config
|
||||
[pageserver]
|
||||
pg_port = 64000
|
||||
http_port = 9898
|
||||
auth_type = 'Trust'
|
||||
|
||||
[[safekeepers]]
|
||||
name = 'single'
|
||||
pg_port = 5454
|
||||
http_port = 7676
|
||||
@@ -39,6 +39,8 @@ impl ComputeControlPlane {
|
||||
// | |- <tenant_id>
|
||||
// | | |- <branch name>
|
||||
pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
|
||||
// TODO: since pageserver do not have config file yet we believe here that
|
||||
// it is running on default port. Change that when pageserver will have config.
|
||||
let pageserver = Arc::new(PageServerNode::from_env(&env));
|
||||
|
||||
let mut nodes = BTreeMap::default();
|
||||
@@ -73,59 +75,40 @@ impl ComputeControlPlane {
|
||||
.unwrap_or(self.base_port)
|
||||
}
|
||||
|
||||
// FIXME: see also parse_point_in_time in branches.rs.
|
||||
fn parse_point_in_time(
|
||||
&self,
|
||||
tenantid: ZTenantId,
|
||||
s: &str,
|
||||
) -> Result<(ZTimelineId, Option<Lsn>)> {
|
||||
let mut strings = s.split('@');
|
||||
let name = strings.next().unwrap();
|
||||
|
||||
let lsn: Option<Lsn>;
|
||||
if let Some(lsnstr) = strings.next() {
|
||||
lsn = Some(
|
||||
Lsn::from_str(lsnstr)
|
||||
.with_context(|| "invalid LSN in point-in-time specification")?,
|
||||
);
|
||||
} else {
|
||||
lsn = None
|
||||
pub fn local(local_env: &LocalEnv, pageserver: &Arc<PageServerNode>) -> ComputeControlPlane {
|
||||
ComputeControlPlane {
|
||||
base_port: 65431,
|
||||
pageserver: Arc::clone(pageserver),
|
||||
nodes: BTreeMap::new(),
|
||||
env: local_env.clone(),
|
||||
}
|
||||
|
||||
// Resolve the timeline ID, given the human-readable branch name
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(&tenantid, name)?
|
||||
.timeline_id;
|
||||
|
||||
Ok((timeline_id, lsn))
|
||||
}
|
||||
|
||||
pub fn new_node(
|
||||
&mut self,
|
||||
tenantid: ZTenantId,
|
||||
name: &str,
|
||||
timeline_spec: &str,
|
||||
branch_name: &str,
|
||||
port: Option<u16>,
|
||||
) -> Result<Arc<PostgresNode>> {
|
||||
// Resolve the human-readable timeline spec into timeline ID and LSN
|
||||
let (timelineid, lsn) = self.parse_point_in_time(tenantid, timeline_spec)?;
|
||||
let timeline_id = self
|
||||
.pageserver
|
||||
.branch_get_by_name(&tenantid, branch_name)?
|
||||
.timeline_id;
|
||||
|
||||
let port = port.unwrap_or_else(|| self.get_port());
|
||||
let node = Arc::new(PostgresNode {
|
||||
name: name.to_owned(),
|
||||
name: branch_name.to_owned(),
|
||||
address: SocketAddr::new("127.0.0.1".parse().unwrap(), port),
|
||||
env: self.env.clone(),
|
||||
pageserver: Arc::clone(&self.pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
lsn,
|
||||
timelineid: timeline_id,
|
||||
tenantid,
|
||||
uses_wal_proposer: false,
|
||||
});
|
||||
|
||||
node.create_pgdata()?;
|
||||
node.setup_pg_conf(self.env.pageserver.auth_type)?;
|
||||
node.setup_pg_conf(self.env.auth_type)?;
|
||||
|
||||
self.nodes
|
||||
.insert((tenantid, node.name.clone()), Arc::clone(&node));
|
||||
@@ -144,7 +127,6 @@ pub struct PostgresNode {
|
||||
pageserver: Arc<PageServerNode>,
|
||||
is_test: bool,
|
||||
pub timelineid: ZTimelineId,
|
||||
pub lsn: Option<Lsn>, // if it's a read-only node. None for primary
|
||||
pub tenantid: ZTenantId,
|
||||
uses_wal_proposer: bool,
|
||||
}
|
||||
@@ -179,11 +161,8 @@ impl PostgresNode {
|
||||
let port: u16 = conf.parse_field("port", &context)?;
|
||||
let timelineid: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
|
||||
let tenantid: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
|
||||
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
|
||||
|
||||
// parse recovery_target_lsn, if any
|
||||
let recovery_target_lsn: Option<Lsn> =
|
||||
conf.parse_field_optional("recovery_target_lsn", &context)?;
|
||||
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
|
||||
|
||||
// ok now
|
||||
Ok(PostgresNode {
|
||||
@@ -193,13 +172,12 @@ impl PostgresNode {
|
||||
pageserver: Arc::clone(pageserver),
|
||||
is_test: false,
|
||||
timelineid,
|
||||
lsn: recovery_target_lsn,
|
||||
tenantid,
|
||||
uses_wal_proposer,
|
||||
})
|
||||
}
|
||||
|
||||
fn sync_safekeepers(&self) -> Result<Lsn> {
|
||||
fn sync_walkeepers(&self) -> Result<Lsn> {
|
||||
let pg_path = self.env.pg_bin_dir().join("postgres");
|
||||
let sync_handle = Command::new(pg_path)
|
||||
.arg("--sync-safekeepers")
|
||||
@@ -224,7 +202,7 @@ impl PostgresNode {
|
||||
}
|
||||
|
||||
let lsn = Lsn::from_str(std::str::from_utf8(&sync_output.stdout)?.trim())?;
|
||||
println!("Safekeepers synced on {}", lsn);
|
||||
println!("Walkeepers synced on {}", lsn);
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
@@ -255,7 +233,7 @@ impl PostgresNode {
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
tar::Archive::new(copyreader)
|
||||
.unpack(&self.pgdata())
|
||||
.with_context(|| "extracting base backup failed")?;
|
||||
.with_context(|| "extracting page backup failed")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -287,15 +265,10 @@ impl PostgresNode {
|
||||
conf.append("max_replication_slots", "10");
|
||||
conf.append("hot_standby", "on");
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("max_wal_size", "100GB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_sender_timeout", "0");
|
||||
conf.append("wal_level", "replica");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
//conf.append("wal_sender_timeout", "5s");
|
||||
//conf.append("max_replication_flush_lag", "160MB");
|
||||
//conf.append("max_replication_apply_lag", "1500MB");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
|
||||
@@ -328,30 +301,11 @@ impl PostgresNode {
|
||||
conf.append("zenith.page_server_connstring", &pageserver_connstr);
|
||||
conf.append("zenith.zenith_tenant", &self.tenantid.to_string());
|
||||
conf.append("zenith.zenith_timeline", &self.timelineid.to_string());
|
||||
if let Some(lsn) = self.lsn {
|
||||
conf.append("recovery_target_lsn", &lsn.to_string());
|
||||
}
|
||||
conf.append_line("");
|
||||
|
||||
if !self.env.safekeepers.is_empty() {
|
||||
// Configure the node to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
|
||||
let wal_acceptors = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("wal_acceptors", &wal_acceptors);
|
||||
} else {
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
// This isn't really a supported configuration, but can be useful for
|
||||
// testing.
|
||||
conf.append("synchronous_standby_names", "pageserver");
|
||||
conf.append("zenith.callmemaybe_connstring", &self.connstr());
|
||||
}
|
||||
// Configure the node to stream WAL directly to the pageserver
|
||||
conf.append("synchronous_standby_names", "pageserver"); // TODO: add a new function arg?
|
||||
conf.append("zenith.callmemaybe_connstring", &self.connstr());
|
||||
|
||||
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
|
||||
file.write_all(conf.to_string().as_bytes())?;
|
||||
@@ -360,14 +314,12 @@ impl PostgresNode {
|
||||
}
|
||||
|
||||
fn load_basebackup(&self) -> Result<()> {
|
||||
let backup_lsn = if let Some(lsn) = self.lsn {
|
||||
Some(lsn)
|
||||
} else if self.uses_wal_proposer {
|
||||
let lsn = if self.uses_wal_proposer {
|
||||
// LSN 0 means that it is bootstrap and we need to download just
|
||||
// latest data from the pageserver. That is a bit clumsy but whole bootstrap
|
||||
// procedure evolves quite actively right now, so let's think about it again
|
||||
// when things would be more stable (TODO).
|
||||
let lsn = self.sync_safekeepers()?;
|
||||
let lsn = self.sync_walkeepers()?;
|
||||
if lsn == Lsn(0) {
|
||||
None
|
||||
} else {
|
||||
@@ -377,7 +329,7 @@ impl PostgresNode {
|
||||
None
|
||||
};
|
||||
|
||||
self.do_basebackup(backup_lsn)?;
|
||||
self.do_basebackup(lsn)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -454,10 +406,6 @@ impl PostgresNode {
|
||||
// 3. Load basebackup
|
||||
self.load_basebackup()?;
|
||||
|
||||
if self.lsn.is_some() {
|
||||
File::create(self.pgdata().join("standby.signal"))?;
|
||||
}
|
||||
|
||||
// 4. Finally start the compute node postgres
|
||||
println!("Starting postgres node at '{}'", self.connstr());
|
||||
self.pg_ctl(&["start"], auth_token)
|
||||
@@ -504,7 +452,9 @@ impl PostgresNode {
|
||||
.output()
|
||||
.expect("failed to execute whoami");
|
||||
|
||||
assert!(output.status.success(), "whoami failed");
|
||||
if !output.status.success() {
|
||||
panic!("whoami failed");
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim().to_string()
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@ use std::path::Path;
|
||||
pub mod compute;
|
||||
pub mod local_env;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
pub mod storage;
|
||||
|
||||
/// Read a PID file
|
||||
|
||||
@@ -7,102 +7,46 @@
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::env;
|
||||
use std::fmt::Write;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::path::PathBuf;
|
||||
use std::process::{Command, Stdio};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::auth::{encode_from_key_path, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
//
|
||||
// This data structures represents zenith CLI config
|
||||
//
|
||||
// It is deserialized from the .zenith/config file, or the config file passed
|
||||
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
|
||||
// an example.
|
||||
// This data structures represent deserialized zenith CLI config
|
||||
//
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
pub struct LocalEnv {
|
||||
// Base directory for all the nodes (the pageserver, safekeepers and
|
||||
// compute nodes).
|
||||
//
|
||||
// This is not stored in the config file. Rather, this is the path where the
|
||||
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
|
||||
// '.zenith' if not given.
|
||||
#[serde(skip)]
|
||||
// Pageserver connection settings
|
||||
pub pageserver_pg_port: u16,
|
||||
pub pageserver_http_port: u16,
|
||||
|
||||
// Base directory for both pageserver and compute nodes
|
||||
pub base_data_dir: PathBuf,
|
||||
|
||||
// Path to postgres distribution. It's expected that "bin", "include",
|
||||
// "lib", "share" from postgres distribution are there. If at some point
|
||||
// in time we will be able to run against vanilla postgres we may split that
|
||||
// to four separate paths and match OS-specific installation layout.
|
||||
#[serde(default)]
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
|
||||
// Path to pageserver binary.
|
||||
#[serde(default)]
|
||||
pub zenith_distrib_dir: PathBuf,
|
||||
|
||||
// Default tenant ID to use with the 'zenith' command line utility, when
|
||||
// --tenantid is not explicitly specified.
|
||||
#[serde(with = "opt_tenantid_serde")]
|
||||
#[serde(default)]
|
||||
pub default_tenantid: Option<ZTenantId>,
|
||||
// keeping tenant id in config to reduce copy paste when running zenith locally with single tenant
|
||||
#[serde(with = "hex")]
|
||||
pub tenantid: ZTenantId,
|
||||
|
||||
// used to issue tokens during e.g pg start
|
||||
#[serde(default)]
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
#[serde(default)]
|
||||
pub safekeepers: Vec<SafekeeperConf>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct PageServerConf {
|
||||
// Pageserver connection settings
|
||||
pub pg_port: u16,
|
||||
pub http_port: u16,
|
||||
// jwt auth token used for communication with pageserver
|
||||
pub auth_token: String,
|
||||
|
||||
// used to determine which auth type is used
|
||||
pub auth_type: AuthType,
|
||||
|
||||
// jwt auth token used for communication with pageserver
|
||||
pub auth_token: String,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
pg_port: 0,
|
||||
http_port: 0,
|
||||
auth_type: AuthType::Trust,
|
||||
auth_token: "".to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct SafekeeperConf {
|
||||
pub name: String,
|
||||
pub pg_port: u16,
|
||||
pub http_port: u16,
|
||||
pub sync: bool,
|
||||
}
|
||||
|
||||
impl Default for SafekeeperConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
name: "".to_string(),
|
||||
pg_port: 0,
|
||||
http_port: 0,
|
||||
sync: true,
|
||||
}
|
||||
}
|
||||
// used to issue tokens during e.g pg start
|
||||
pub private_key_path: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalEnv {
|
||||
@@ -118,10 +62,6 @@ impl LocalEnv {
|
||||
Ok(self.zenith_distrib_dir.join("pageserver"))
|
||||
}
|
||||
|
||||
pub fn safekeeper_bin(&self) -> Result<PathBuf> {
|
||||
Ok(self.zenith_distrib_dir.join("safekeeper"))
|
||||
}
|
||||
|
||||
pub fn pg_data_dirs_path(&self) -> PathBuf {
|
||||
self.base_data_dir.join("pgdatadirs").join("tenants")
|
||||
}
|
||||
@@ -136,187 +76,6 @@ impl LocalEnv {
|
||||
pub fn pageserver_data_dir(&self) -> PathBuf {
|
||||
self.base_data_dir.clone()
|
||||
}
|
||||
|
||||
pub fn safekeeper_data_dir(&self, node_name: &str) -> PathBuf {
|
||||
self.base_data_dir.join("safekeepers").join(node_name)
|
||||
}
|
||||
|
||||
/// Create a LocalEnv from a config file.
|
||||
///
|
||||
/// Unlike 'load_config', this function fills in any defaults that are missing
|
||||
/// from the config file.
|
||||
pub fn create_config(toml: &str) -> Result<LocalEnv> {
|
||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
||||
|
||||
// Find postgres binaries.
|
||||
// Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
if env.pg_distrib_dir == Path::new("") {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
env.pg_distrib_dir = postgres_bin.into();
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
env.pg_distrib_dir = cwd.join("tmp_install")
|
||||
}
|
||||
}
|
||||
if !env.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
// Find zenith binaries.
|
||||
if env.zenith_distrib_dir == Path::new("") {
|
||||
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
if !env.zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.");
|
||||
}
|
||||
if !env.zenith_distrib_dir.join("safekeeper").exists() {
|
||||
anyhow::bail!("Can't find safekeeper binary.");
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenantid.is_none() {
|
||||
env.default_tenantid = Some(ZTenantId::generate());
|
||||
}
|
||||
|
||||
env.base_data_dir = base_path();
|
||||
|
||||
Ok(env)
|
||||
}
|
||||
|
||||
/// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
let mut env: LocalEnv = toml::from_str(config.as_str())?;
|
||||
|
||||
env.base_data_dir = repopath;
|
||||
|
||||
Ok(env)
|
||||
}
|
||||
|
||||
// this function is used only for testing purposes in CLI e g generate tokens during init
|
||||
pub fn generate_auth_token(&self, claims: &Claims) -> Result<String> {
|
||||
let private_key_path = if self.private_key_path.is_absolute() {
|
||||
self.private_key_path.to_path_buf()
|
||||
} else {
|
||||
self.base_data_dir.join(&self.private_key_path)
|
||||
};
|
||||
|
||||
let key_data = fs::read(private_key_path)?;
|
||||
encode_from_key_file(claims, &key_data)
|
||||
}
|
||||
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(&mut self) -> Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = &self.base_data_dir;
|
||||
if base_path == Path::new("") {
|
||||
anyhow::bail!("repository base path is missing");
|
||||
}
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
// generate keys for jwt
|
||||
// openssl genrsa -out private_key.pem 2048
|
||||
let private_key_path;
|
||||
if self.private_key_path == PathBuf::new() {
|
||||
private_key_path = base_path.join("auth_private_key.pem");
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("genrsa")
|
||||
.args(&["-out", private_key_path.to_str().unwrap()])
|
||||
.arg("2048")
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
self.private_key_path = Path::new("auth_private_key.pem").to_path_buf();
|
||||
|
||||
let public_key_path = base_path.join("auth_public_key.pem");
|
||||
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("rsa")
|
||||
.args(&["-in", private_key_path.to_str().unwrap()])
|
||||
.arg("-pubout")
|
||||
.args(&["-outform", "PEM"])
|
||||
.args(&["-out", public_key_path.to_str().unwrap()])
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
self.pageserver.auth_token =
|
||||
self.generate_auth_token(&Claims::new(None, Scope::PageServerApi))?;
|
||||
|
||||
fs::create_dir_all(self.pg_data_dirs_path())?;
|
||||
|
||||
for safekeeper in self.safekeepers.iter() {
|
||||
fs::create_dir_all(self.safekeeper_data_dir(&safekeeper.name))?;
|
||||
}
|
||||
|
||||
let mut conf_content = String::new();
|
||||
|
||||
// Currently, the user first passes a config file with 'zenith init --config=<path>'
|
||||
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
|
||||
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
|
||||
// a bit sad.
|
||||
write!(
|
||||
&mut conf_content,
|
||||
r#"# This file describes a locale deployment of the page server
|
||||
# and safekeeeper node. It is read by the 'zenith' command-line
|
||||
# utility.
|
||||
"#
|
||||
)?;
|
||||
|
||||
// Convert the LocalEnv to a toml file.
|
||||
//
|
||||
// This could be as simple as this:
|
||||
//
|
||||
// conf_content += &toml::to_string_pretty(env)?;
|
||||
//
|
||||
// But it results in a "values must be emitted before tables". I'm not sure
|
||||
// why, AFAICS the table, i.e. 'safekeepers: Vec<SafekeeperConf>' is last.
|
||||
// Maybe rust reorders the fields to squeeze avoid padding or something?
|
||||
// In any case, converting to toml::Value first, and serializing that, works.
|
||||
// See https://github.com/alexcrichton/toml-rs/issues/142
|
||||
conf_content += &toml::to_string_pretty(&toml::Value::try_from(&self)?)?;
|
||||
|
||||
fs::write(base_path.join("config"), conf_content)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
@@ -326,29 +85,118 @@ fn base_path() -> PathBuf {
|
||||
}
|
||||
}
|
||||
|
||||
/// Serde routines for Option<ZTenantId>. The serialized form is a hex string.
|
||||
mod opt_tenantid_serde {
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
pub fn serialize<S>(tenantid: &Option<ZTenantId>, ser: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
tenantid.map(|t| t.to_string()).serialize(ser)
|
||||
//
|
||||
// Initialize a new Zenith repository
|
||||
//
|
||||
pub fn init(
|
||||
pageserver_pg_port: u16,
|
||||
pageserver_http_port: u16,
|
||||
tenantid: ZTenantId,
|
||||
auth_type: AuthType,
|
||||
) -> Result<()> {
|
||||
// check if config already exists
|
||||
let base_path = base_path();
|
||||
if base_path.exists() {
|
||||
anyhow::bail!(
|
||||
"{} already exists. Perhaps already initialized?",
|
||||
base_path.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
pub fn deserialize<'de, D>(des: D) -> Result<Option<ZTenantId>, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
{
|
||||
let s: Option<String> = Option::deserialize(des)?;
|
||||
if let Some(s) = s {
|
||||
return Ok(Some(
|
||||
ZTenantId::from_str(&s).map_err(serde::de::Error::custom)?,
|
||||
));
|
||||
// ok, now check that expected binaries are present
|
||||
|
||||
// Find postgres binaries. Follow POSTGRES_DISTRIB_DIR if set, otherwise look in "tmp_install".
|
||||
let pg_distrib_dir: PathBuf = {
|
||||
if let Some(postgres_bin) = env::var_os("POSTGRES_DISTRIB_DIR") {
|
||||
postgres_bin.into()
|
||||
} else {
|
||||
let cwd = env::current_dir()?;
|
||||
cwd.join("tmp_install")
|
||||
}
|
||||
Ok(None)
|
||||
};
|
||||
if !pg_distrib_dir.join("bin/postgres").exists() {
|
||||
anyhow::bail!("Can't find postgres binary at {:?}", pg_distrib_dir);
|
||||
}
|
||||
|
||||
// generate keys for jwt
|
||||
// openssl genrsa -out private_key.pem 2048
|
||||
let private_key_path = base_path.join("auth_private_key.pem");
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("genrsa")
|
||||
.args(&["-out", private_key_path.to_str().unwrap()])
|
||||
.arg("2048")
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let public_key_path = base_path.join("auth_public_key.pem");
|
||||
// openssl rsa -in private_key.pem -pubout -outform PEM -out public_key.pem
|
||||
let keygen_output = Command::new("openssl")
|
||||
.arg("rsa")
|
||||
.args(&["-in", private_key_path.to_str().unwrap()])
|
||||
.arg("-pubout")
|
||||
.args(&["-outform", "PEM"])
|
||||
.args(&["-out", public_key_path.to_str().unwrap()])
|
||||
.stdout(Stdio::null())
|
||||
.output()
|
||||
.with_context(|| "failed to generate auth private key")?;
|
||||
if !keygen_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"openssl failed: '{}'",
|
||||
String::from_utf8_lossy(&keygen_output.stderr)
|
||||
);
|
||||
}
|
||||
|
||||
let auth_token =
|
||||
encode_from_key_path(&Claims::new(None, Scope::PageServerApi), &private_key_path)?;
|
||||
|
||||
// Find zenith binaries.
|
||||
let zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
if !zenith_distrib_dir.join("pageserver").exists() {
|
||||
anyhow::bail!("Can't find pageserver binary.",);
|
||||
}
|
||||
|
||||
let conf = LocalEnv {
|
||||
pageserver_pg_port,
|
||||
pageserver_http_port,
|
||||
pg_distrib_dir,
|
||||
zenith_distrib_dir,
|
||||
base_data_dir: base_path,
|
||||
tenantid,
|
||||
auth_token,
|
||||
auth_type,
|
||||
private_key_path,
|
||||
};
|
||||
|
||||
fs::create_dir_all(conf.pg_data_dirs_path())?;
|
||||
|
||||
let toml = toml::to_string_pretty(&conf)?;
|
||||
fs::write(conf.base_data_dir.join("config"), toml)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Locate and load config
|
||||
pub fn load_config() -> Result<LocalEnv> {
|
||||
let repopath = base_path();
|
||||
|
||||
if !repopath.exists() {
|
||||
anyhow::bail!(
|
||||
"Zenith config is not found in {}. You need to run 'zenith init' first",
|
||||
repopath.to_str().unwrap()
|
||||
);
|
||||
}
|
||||
|
||||
// TODO: check that it looks like a zenith repository
|
||||
|
||||
// load and parse file
|
||||
let config = fs::read_to_string(repopath.join("config"))?;
|
||||
toml::from_str(config.as_str()).map_err(|e| e.into())
|
||||
}
|
||||
|
||||
@@ -83,22 +83,6 @@ impl PostgresConf {
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))
|
||||
}
|
||||
|
||||
pub fn parse_field_optional<T>(&self, field_name: &str, context: &str) -> Result<Option<T>>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
if let Some(val) = self.get(field_name) {
|
||||
let result = val
|
||||
.parse::<T>()
|
||||
.with_context(|| format!("could not parse '{}' option {}", field_name, context))?;
|
||||
|
||||
Ok(Some(result))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Note: if you call this multiple times for the same option, the config
|
||||
/// file will a line for each call. It would be nice to have a function
|
||||
|
||||
@@ -1,281 +0,0 @@
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::read_pidfile;
|
||||
use crate::storage::PageServerNode;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
#[error("Reqwest error: {0}")]
|
||||
Transport(#[from] reqwest::Error),
|
||||
|
||||
#[error("Error: {0}")]
|
||||
Response(String),
|
||||
}
|
||||
|
||||
type Result<T> = result::Result<T, SafekeeperHttpError>;
|
||||
|
||||
pub trait ResponseErrorMessageExt: Sized {
|
||||
fn error_from_body(self) -> Result<Self>;
|
||||
}
|
||||
|
||||
impl ResponseErrorMessageExt for Response {
|
||||
fn error_from_body(self) -> Result<Self> {
|
||||
let status = self.status();
|
||||
if !(status.is_client_error() || status.is_server_error()) {
|
||||
return Ok(self);
|
||||
}
|
||||
|
||||
// reqwest do not export it's error construction utility functions, so lets craft the message ourselves
|
||||
let url = self.url().to_owned();
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Control routines for safekeeper.
|
||||
//
|
||||
// Used in CLI and tests.
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct SafekeeperNode {
|
||||
pub name: String,
|
||||
|
||||
pub conf: SafekeeperConf,
|
||||
|
||||
pub pg_connection_config: Config,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
pub http_base_url: String,
|
||||
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(env));
|
||||
|
||||
println!("initializing for {} for {}", conf.name, conf.http_port);
|
||||
|
||||
SafekeeperNode {
|
||||
name: conf.name.clone(),
|
||||
conf: conf.clone(),
|
||||
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://localhost:{}/v1", conf.http_port),
|
||||
pageserver,
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to this safekeeper.
|
||||
fn safekeeper_connection_config(port: u16) -> Config {
|
||||
// TODO safekeeper authentication not implemented yet
|
||||
format!("postgresql://no_user@localhost:{}/no_db", port)
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn datadir_path(&self) -> PathBuf {
|
||||
self.env.safekeeper_data_dir(&self.name)
|
||||
}
|
||||
|
||||
pub fn pid_file(&self) -> PathBuf {
|
||||
self.datadir_path().join("safekeeper.pid")
|
||||
}
|
||||
|
||||
pub fn start(&self) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
connection_address(&self.pg_connection_config),
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
// Configure connection to page server
|
||||
//
|
||||
// FIXME: We extract the host and port from the connection string instead of using
|
||||
// the connection string directly, because the 'safekeeper' binary expects
|
||||
// host:port format. That's a bit silly when we already have a full libpq connection
|
||||
// string at hand.
|
||||
let pageserver_conn = {
|
||||
let (host, port) = connection_host_port(&self.pageserver.pg_connection_config);
|
||||
format!("{}:{}", host, port)
|
||||
};
|
||||
|
||||
let listen_pg = format!("localhost:{}", self.conf.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.conf.http_port);
|
||||
|
||||
let mut cmd = Command::new(self.env.safekeeper_bin()?);
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.args(&["--pageserver", &pageserver_conn])
|
||||
.args(&["--recall", "1 second"])
|
||||
.arg("--daemonize")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
|
||||
}
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
self.datadir_path().join("safekeeper.log").display()
|
||||
);
|
||||
}
|
||||
|
||||
// It takes a while for the safekeeper to start up. Wait until it is
|
||||
// open for business.
|
||||
const RETRIES: i8 = 15;
|
||||
for retries in 1..RETRIES {
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("\nSafekeeper started");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => {
|
||||
match err {
|
||||
SafekeeperHttpError::Transport(err) => {
|
||||
if err.is_connect() && retries < 5 {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
} else {
|
||||
if retries == 5 {
|
||||
println!() // put a line break after dots for second message
|
||||
}
|
||||
println!(
|
||||
"Safekeeper not responding yet, err {} retrying ({})...",
|
||||
err, retries
|
||||
);
|
||||
}
|
||||
}
|
||||
SafekeeperHttpError::Response(msg) => {
|
||||
bail!("safekeeper failed to start: {} ", msg)
|
||||
}
|
||||
}
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
bail!("safekeeper failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
|
||||
/// Otherwise we use SIGTERM, triggering a clean shutdown
|
||||
///
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Safekeeper {} is already stopped", self.name);
|
||||
return Ok(());
|
||||
}
|
||||
let pid = read_pidfile(&pid_file)?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
println!("Stop safekeeper immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
println!("Stop safekeeper gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Safekeeper with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to safekeeper with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if safekeeper flushes a lot of data
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Safekeeper stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Safekeeper status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Safekeeper status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Safekeeper still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
}
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
// TODO: authentication
|
||||
//if self.env.auth_type == AuthType::ZenithJWT {
|
||||
// builder = builder.bearer_auth(&self.env.safekeeper_auth_token)
|
||||
//}
|
||||
self.http_client.request(method, url)
|
||||
}
|
||||
|
||||
pub fn check_status(&self) -> Result<()> {
|
||||
self.http_request(Method::GET, format!("{}/{}", self.http_base_url, "status"))
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -5,8 +5,7 @@ use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::bail;
|
||||
use nix::errno::Errno;
|
||||
use anyhow::{anyhow, bail};
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{BranchCreateRequest, TenantCreateRequest};
|
||||
@@ -21,7 +20,6 @@ use zenith_utils::zid::ZTenantId;
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::read_pidfile;
|
||||
use pageserver::branches::BranchInfo;
|
||||
use pageserver::tenant_mgr::TenantInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
@@ -64,6 +62,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct PageServerNode {
|
||||
pub kill_on_exit: bool,
|
||||
pub pg_connection_config: Config,
|
||||
pub env: LocalEnv,
|
||||
pub http_client: Client,
|
||||
@@ -72,33 +71,34 @@ pub struct PageServerNode {
|
||||
|
||||
impl PageServerNode {
|
||||
pub fn from_env(env: &LocalEnv) -> PageServerNode {
|
||||
let password = if env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
&env.pageserver.auth_token
|
||||
let password = if env.auth_type == AuthType::ZenithJWT {
|
||||
&env.auth_token
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
PageServerNode {
|
||||
kill_on_exit: false,
|
||||
pg_connection_config: Self::pageserver_connection_config(
|
||||
password,
|
||||
env.pageserver.pg_port,
|
||||
env.pageserver_pg_port,
|
||||
),
|
||||
env: env.clone(),
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://localhost:{}/v1", env.pageserver.http_port),
|
||||
http_base_url: format!("http://localhost:{}/v1", env.pageserver_http_port),
|
||||
}
|
||||
}
|
||||
|
||||
/// Construct libpq connection string for connecting to the pageserver.
|
||||
fn pageserver_connection_config(password: &str, port: u16) -> Config {
|
||||
format!("postgresql://no_user:{}@localhost:{}/no_db", password, port)
|
||||
.parse()
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
|
||||
let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
|
||||
pub fn init(&self, create_tenant: Option<&str>, enable_auth: bool) -> anyhow::Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let listen_pg = format!("localhost:{}", self.env.pageserver_pg_port);
|
||||
let listen_http = format!("localhost:{}", self.env.pageserver_http_port);
|
||||
let mut args = vec![
|
||||
"--init",
|
||||
"-D",
|
||||
@@ -111,29 +111,27 @@ impl PageServerNode {
|
||||
&listen_http,
|
||||
];
|
||||
|
||||
let auth_type_str = &self.env.pageserver.auth_type.to_string();
|
||||
if self.env.pageserver.auth_type != AuthType::Trust {
|
||||
if enable_auth {
|
||||
args.extend(&["--auth-validation-public-key-path", "auth_public_key.pem"]);
|
||||
args.extend(&["--auth-type", "ZenithJWT"]);
|
||||
}
|
||||
args.extend(&["--auth-type", auth_type_str]);
|
||||
|
||||
if let Some(tenantid) = create_tenant {
|
||||
args.extend(&["--create-tenant", tenantid])
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(args).env_clear().env("RUST_BACKTRACE", "1");
|
||||
let status = cmd
|
||||
.args(args)
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.status()
|
||||
.expect("pageserver init failed");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
if status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("pageserver init failed"))
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!("pageserver init failed");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
@@ -154,15 +152,10 @@ impl PageServerNode {
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(&["-D", self.repo_path().to_str().unwrap()])
|
||||
.arg("--daemonize")
|
||||
.arg("-d")
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
@@ -206,69 +199,23 @@ impl PageServerNode {
|
||||
bail!("pageserver failed to start in {} seconds", RETRIES);
|
||||
}
|
||||
|
||||
///
|
||||
/// Stop the server.
|
||||
///
|
||||
/// If 'immediate' is true, we use SIGQUIT, killing the process immediately.
|
||||
/// Otherwise we use SIGTERM, triggering a clean shutdown
|
||||
///
|
||||
/// If the server is not running, returns success
|
||||
///
|
||||
pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
|
||||
let pid_file = self.pid_file();
|
||||
if !pid_file.exists() {
|
||||
println!("Pageserver is already stopped");
|
||||
return Ok(());
|
||||
pub fn stop(&self) -> anyhow::Result<()> {
|
||||
let pid = read_pidfile(&self.pid_file())?;
|
||||
let pid = Pid::from_raw(pid);
|
||||
if kill(pid, Signal::SIGTERM).is_err() {
|
||||
bail!("Failed to kill pageserver with pid {}", pid);
|
||||
}
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
println!("Stop pageserver immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
println!("Stop pageserver gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
println!(
|
||||
"Pageserver with pid {} does not exist, but a PID file was found",
|
||||
pid
|
||||
);
|
||||
// wait for pageserver stop
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
for _ in 0..5 {
|
||||
let stream = TcpStream::connect(&address);
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
if let Err(_e) = stream {
|
||||
println!("Pageserver stopped");
|
||||
return Ok(());
|
||||
}
|
||||
Err(err) => bail!(
|
||||
"Failed to send signal to pageserver with pid {}: {}",
|
||||
pid,
|
||||
err.desc()
|
||||
),
|
||||
}
|
||||
|
||||
let address = connection_address(&self.pg_connection_config);
|
||||
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
for _ in 0..100 {
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Pageserver stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Ok(_) => {
|
||||
println!("Pageserver status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
println!("Pageserver status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Pageserver still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
println!("Stopping pageserver on {}", address);
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
@@ -287,8 +234,8 @@ impl PageServerNode {
|
||||
|
||||
fn http_request<U: IntoUrl>(&self, method: Method, url: U) -> RequestBuilder {
|
||||
let mut builder = self.http_client.request(method, url);
|
||||
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
builder = builder.bearer_auth(&self.env.pageserver.auth_token)
|
||||
if self.env.auth_type == AuthType::ZenithJWT {
|
||||
builder = builder.bearer_auth(&self.env.auth_token)
|
||||
}
|
||||
builder
|
||||
}
|
||||
@@ -300,7 +247,7 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn tenant_list(&self) -> Result<Vec<TenantInfo>> {
|
||||
pub fn tenant_list(&self) -> Result<Vec<String>> {
|
||||
Ok(self
|
||||
.http_request(Method::GET, format!("{}/{}", self.http_base_url, "tenant"))
|
||||
.send()?
|
||||
@@ -363,3 +310,11 @@ impl PageServerNode {
|
||||
.json()?)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageServerNode {
|
||||
fn drop(&mut self) {
|
||||
if self.kill_on_exit {
|
||||
let _ = self.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ if [ "$1" = 'pageserver' ]; then
|
||||
pageserver --init -D /data --postgres-distrib /usr/local
|
||||
fi
|
||||
echo "Staring pageserver at 0.0.0.0:6400"
|
||||
pageserver -l 0.0.0.0:6400 --listen-http 0.0.0.0:9898 -D /data
|
||||
pageserver -l 0.0.0.0:6400 -D /data
|
||||
else
|
||||
"$@"
|
||||
fi
|
||||
|
||||
@@ -10,5 +10,5 @@
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [walkeeper/README](/walkeeper/README) — WAL service overview.
|
||||
- [walkeeper/README](/walkeeper/README.md) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `wal_acceptor` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
|
||||
|
||||
And two intermediate images used either to reduce build time or to deliver some additional binary tools from other repos:
|
||||
|
||||
@@ -51,14 +51,11 @@ Each PostgreSQL fork is considered a separate relish.
|
||||
|
||||
### Layer
|
||||
|
||||
A layer contains data needed to reconstruct any page versions within the
|
||||
layer's Segment and range of LSNs.
|
||||
|
||||
Each layer corresponds to the specific version of a relish Segment in a range of LSNs.
|
||||
There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
layers are used to ingest incoming WAL, and provide fast access
|
||||
to the recent page versions. On-disk layers are stored as files on disk, and
|
||||
are immutable. See pageserver/src/layered_repository/README.md for more.
|
||||
|
||||
are immutable.
|
||||
### Layer file (on-disk layer)
|
||||
|
||||
Layered repository on-disk format is based on immutable files. The
|
||||
|
||||
@@ -56,4 +56,4 @@ Tenant id is passed to postgres via GUC the same way as the timeline. Tenant id
|
||||
|
||||
### Safety
|
||||
|
||||
For now particular tenant can only appear on a particular pageserver. Set of safekeepers are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
For now particular tenant can only appear on a particular pageserver. Set of WAL acceptors are also pinned to particular (tenantid, timeline) pair so there can only be one writer for particular (tenantid, timeline).
|
||||
|
||||
128
docs/settings.md
128
docs/settings.md
@@ -1,128 +0,0 @@
|
||||
## Pageserver
|
||||
|
||||
### listen_pg_addr
|
||||
|
||||
Network interface and port number to listen at for connections from
|
||||
the compute nodes and safekeepers. The default is `127.0.0.1:64000`.
|
||||
|
||||
### listen_http_addr
|
||||
|
||||
Network interface and port number to listen at for admin connections.
|
||||
The default is `127.0.0.1:9898`.
|
||||
|
||||
### checkpoint_distance
|
||||
|
||||
`checkpoint_distance` is the amount of incoming WAL that is held in
|
||||
the open layer, before it's flushed to local disk. It puts an upper
|
||||
bound on how much WAL needs to be re-processed after a pageserver
|
||||
crash. It is a soft limit, the pageserver can momentarily go above it,
|
||||
but it will trigger a checkpoint operation to get it back below the
|
||||
limit.
|
||||
|
||||
`checkpoint_distance` also determines how much WAL needs to be kept
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
this much WAL, with some headroom, otherwise you can get stuck in a
|
||||
situation where the safekeeper is full and stops accepting new WAL,
|
||||
but the pageserver is not flushing out and releasing the space in the
|
||||
safekeeper because it hasn't reached checkpoint_distance yet.
|
||||
|
||||
`checkpoint_distance` also controls how often the WAL is uploaded to
|
||||
S3.
|
||||
|
||||
The unit is # of bytes.
|
||||
|
||||
### checkpoint_period
|
||||
|
||||
The pageserver checks whether `checkpoint_distance` has been reached
|
||||
every `checkpoint_period` seconds. Default is 1 s, which should be
|
||||
fine.
|
||||
|
||||
### gc_horizon
|
||||
|
||||
`gz_horizon` determines how much history is retained, to allow
|
||||
branching and read replicas at an older point in time. The unit is #
|
||||
of bytes of WAL. Page versions older than this are garbage collected
|
||||
away.
|
||||
|
||||
### gc_period
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
|
||||
### superuser
|
||||
|
||||
Name of the initial superuser role, passed to initdb when a new tenant
|
||||
is initialized. It doesn't affect anything after initialization. The
|
||||
default is Note: The default is 'zenith_admin', and the console
|
||||
depends on that, so if you change it, bad things will happen.
|
||||
|
||||
### page_cache_size
|
||||
|
||||
Size of the page cache, to hold materialized page versions. Unit is
|
||||
number of 8 kB blocks. The default is 8192, which means 64 MB.
|
||||
|
||||
### max_file_descriptors
|
||||
|
||||
Max number of file descriptors to hold open concurrently for accessing
|
||||
layer files. This should be kept well below the process/container/OS
|
||||
limit (see `ulimit -n`), as the pageserver also needs file descriptors
|
||||
for other files and for sockets for incoming connections.
|
||||
|
||||
### postgres-distrib
|
||||
|
||||
A directory with Postgres installation to use during pageserver activities.
|
||||
Inside that dir, a `bin/postgres` binary should be present.
|
||||
|
||||
The default distrib dir is `./tmp_install/`.
|
||||
|
||||
### workdir (-D)
|
||||
|
||||
A directory in the file system, where pageserver will store its files.
|
||||
The default is `./.zenith/`.
|
||||
|
||||
### Remote storage
|
||||
|
||||
There's a way to automatically backup and restore some of the pageserver's data from working dir to the remote storage.
|
||||
The backup system is disabled by default and can be enabled for either of the currently available storages:
|
||||
|
||||
#### Local FS storage
|
||||
|
||||
##### remote-storage-local-path
|
||||
|
||||
Pageserver can back up and restore some of its workdir contents to another directory.
|
||||
For that, only a path to that directory needs to be specified as a parameter.
|
||||
|
||||
#### S3 storage
|
||||
|
||||
Pageserver can back up and restore some of its workdir contents to S3.
|
||||
Full set of S3 credentials is needed for that as parameters:
|
||||
|
||||
##### remote-storage-s3-bucket
|
||||
|
||||
Name of the bucket to connect to, example: "some-sample-bucket".
|
||||
|
||||
##### remote-storage-region
|
||||
|
||||
Name of the region where the bucket is located at, example: "eu-north-1"
|
||||
|
||||
##### remote-storage-access-key
|
||||
|
||||
Access key to connect to the bucket ("login" part of the credentials), example: "AKIAIOSFODNN7EXAMPLE"
|
||||
|
||||
##### remote-storage-secret-access-key
|
||||
|
||||
Secret access key to connect to the bucket ("password" part of the credentials), example: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
|
||||
|
||||
#### General remote storage configuration
|
||||
|
||||
Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
|
||||
No default values are used for the remote storage configuration parameters.
|
||||
|
||||
##### remote-storage-max-concurrent-sync
|
||||
|
||||
Max number of concurrent connections to open for uploading to or
|
||||
downloading from S3.
|
||||
The default value is 100.
|
||||
|
||||
## safekeeper
|
||||
|
||||
TODO
|
||||
@@ -79,61 +79,3 @@ Helpers for exposing Prometheus metrics from the server.
|
||||
`/zenith_utils`:
|
||||
|
||||
Helpers that are shared between other crates in this repository.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
|
||||
A single virtual environment with all dependencies is described in the single `Pipfile`.
|
||||
|
||||
### Prerequisites
|
||||
- Install Python 3.7 (the minimal supported version)
|
||||
- Later version (e.g. 3.8) is ok if you don't write Python code
|
||||
- You can install Python 3.7 separately, e.g.:
|
||||
```bash
|
||||
# In Ubuntu
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa
|
||||
sudo apt update
|
||||
sudo apt install python3.7
|
||||
```
|
||||
- Install `pipenv`
|
||||
- Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`.
|
||||
- Install dependencies via either
|
||||
* `pipenv --python 3.7 install --dev` if you will write Python code, or
|
||||
* `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
|
||||
|
||||
Run `pipenv shell` to activate the virtual environment.
|
||||
Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.
|
||||
|
||||
### Obligatory checks
|
||||
We force code formatting via `yapf` and type hints via `mypy`.
|
||||
Run the following commands in the repository's root (next to `setup.cfg`):
|
||||
|
||||
```bash
|
||||
pipenv run yapf -ri . # All code is reformatted
|
||||
pipenv run mypy . # Ensure there are no typing errors
|
||||
```
|
||||
|
||||
**WARNING**: do not run `mypy` from a directory other than the root of the repository.
|
||||
Otherwise it will not find its configuration.
|
||||
|
||||
Also consider:
|
||||
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Adding more type hints to your code to avoid `Any`.
|
||||
|
||||
### Changing dependencies
|
||||
You have to update `Pipfile.lock` if you have changed `Pipfile`:
|
||||
|
||||
```bash
|
||||
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
|
||||
pipenv run pipenv --version # Should be at least 2021.5.29
|
||||
pipenv run pipenv lock # Regenerate Pipfile.lock
|
||||
```
|
||||
|
||||
As the minimal supported version is Python 3.7 and we use it in CI,
|
||||
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
|
||||
Otherwise some back-compatibility packages will be missing.
|
||||
|
||||
It is also important to run recent `pipenv`.
|
||||
Older versions remove markers from `Pipfile.lock`.
|
||||
|
||||
@@ -5,7 +5,7 @@ authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
|
||||
bookfile = "^0.3"
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
@@ -32,21 +32,11 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
toml = "0.5"
|
||||
scopeguard = "1.1.0"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
once_cell = "1.8.0"
|
||||
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
hex-literal = "0.3"
|
||||
tempfile = "3.2"
|
||||
|
||||
@@ -7,9 +7,8 @@ The Page Server has a few different duties:
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
- Backup to S3
|
||||
|
||||
S3 is the main fault-tolerant storage of all data, as there are no Page Server
|
||||
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
|
||||
keeps track of WAL records which are not syncted to S3 yet.
|
||||
|
||||
|
||||
|
||||
The Page Server consists of multiple threads that operate on a shared
|
||||
repository of page versions:
|
||||
@@ -41,7 +40,7 @@ Legend:
|
||||
+--+
|
||||
|
||||
....
|
||||
. . Component at its early development phase.
|
||||
. . Component that we will need, but doesn't exist at the moment. A TODO.
|
||||
....
|
||||
|
||||
---> Data flow
|
||||
@@ -116,49 +115,13 @@ Remove old on-disk layer files that are no longer needed according to the
|
||||
PITR retention policy
|
||||
|
||||
|
||||
### Backup service
|
||||
TODO: Backup service
|
||||
--------------------
|
||||
|
||||
The backup service, responsible for storing pageserver recovery data externally.
|
||||
The backup service is responsible for periodically pushing the chunks to S3.
|
||||
|
||||
Currently, pageserver stores its files in a filesystem directory it's pointed to.
|
||||
That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
|
||||
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
|
||||
|
||||
The code for storage support is extensible and can support arbitrary ones as long as they implement a certain Rust trait.
|
||||
There are the following implementations present:
|
||||
* local filesystem — to use in tests mainly
|
||||
* AWS S3 - to use in production
|
||||
|
||||
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs.
|
||||
|
||||
The backup service is disabled by default and can be enabled to interact with a single remote storage.
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} --remote-storage-local-path="/some/local/path/"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} --remote-storage-s3-bucket="some-sample-bucket" --remote-storage-region="eu-north-1" --remote-storage-access-key="SOMEKEYAAAAASADSAH*#" --remote-storage-secret-access-key="SOMEsEcReTsd292v"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
|
||||
Required sections are:
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
|
||||
TODO: How/when do restore from S3? Whenever we get a GetPage@LSN request for
|
||||
a chunk we don't currently have? Or when an external Control Plane tells us?
|
||||
|
||||
TODO: Sharding
|
||||
--------------------
|
||||
@@ -10,10 +10,9 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
@@ -32,7 +31,7 @@ use zenith_utils::lsn::Lsn;
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<dyn Timeline>,
|
||||
pub lsn: Lsn,
|
||||
lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
}
|
||||
|
||||
@@ -84,7 +83,7 @@ impl<'a> Basebackup<'a> {
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={}",
|
||||
backup_lsn, backup_prev
|
||||
backup_prev, backup_lsn
|
||||
);
|
||||
|
||||
Ok(Basebackup {
|
||||
@@ -98,6 +97,7 @@ impl<'a> Basebackup<'a> {
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
// Create pgdata subdirs structure
|
||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||
info!("send subdir {:?}", *dir);
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
self.ar.append(&header, &mut io::empty())?;
|
||||
}
|
||||
@@ -242,16 +242,20 @@ impl<'a> Basebackup<'a> {
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
// Generate new pg_control needed for bootstrap
|
||||
// Generate new pg_control and WAL needed for bootstrap
|
||||
let checkpoint_segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let checkpoint_lsn = XLogSegNoOffsetToRecPtr(
|
||||
checkpoint_segno,
|
||||
XLOG_SIZE_OF_XLOG_LONG_PHD as u32,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
checkpoint.redo = normalize_lsn(self.lsn, pg_constants::WAL_SEGMENT_SIZE).0;
|
||||
|
||||
//reset some fields we don't want to preserve
|
||||
@@ -260,24 +264,19 @@ impl<'a> Basebackup<'a> {
|
||||
checkpoint.oldestActiveXid = 0;
|
||||
|
||||
//save new values in pg_control
|
||||
pg_control.checkPoint = 0;
|
||||
pg_control.checkPoint = checkpoint_lsn;
|
||||
pg_control.checkPointCopy = checkpoint;
|
||||
pg_control.state = pg_constants::DB_SHUTDOWNED;
|
||||
|
||||
// add zenith.signal file
|
||||
let mut zenith_signal = String::new();
|
||||
if self.prev_record_lsn == Lsn(0) {
|
||||
if self.lsn == self.timeline.get_ancestor_lsn() {
|
||||
write!(zenith_signal, "PREV LSN: none")?;
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: invalid")?;
|
||||
}
|
||||
let xl_prev = if self.prev_record_lsn == Lsn(0) {
|
||||
0xBAD0 // magic value to indicate that we don't know prev_lsn
|
||||
} else {
|
||||
write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)?;
|
||||
}
|
||||
self.prev_record_lsn.0
|
||||
};
|
||||
self.ar.append(
|
||||
&new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
|
||||
zenith_signal.as_bytes(),
|
||||
&new_tar_header("zenith.signal", 8)?,
|
||||
&xl_prev.to_le_bytes()[..],
|
||||
)?;
|
||||
|
||||
//send pg_control
|
||||
@@ -286,11 +285,14 @@ impl<'a> Basebackup<'a> {
|
||||
self.ar.append(&header, &pg_control_bytes[..])?;
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
checkpoint_segno,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
let wal_seg = generate_wal_segment(&pg_control);
|
||||
assert!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
|
||||
@@ -4,14 +4,11 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
.about("Dump contents of one layer file, for debugging")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("path")
|
||||
.help("Path to file to dump")
|
||||
@@ -22,9 +19,6 @@ fn main() -> Result<()> {
|
||||
|
||||
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
|
||||
dump_layerfile_from_path(&path)?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -2,35 +2,33 @@
|
||||
// Main entry point for the Page Server executable
|
||||
//
|
||||
|
||||
use log::*;
|
||||
use pageserver::defaults::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
thread,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{
|
||||
branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
|
||||
virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
|
||||
branches,
|
||||
defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR},
|
||||
http, page_service, tenant_mgr, PageServerConf, RelishStorageConfig, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::postgres_backend;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals::{self, Signal};
|
||||
|
||||
use const_format::formatcp;
|
||||
|
||||
/// String arguments that can be declared via CLI or config file
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct CfgFileParams {
|
||||
listen_pg_addr: Option<String>,
|
||||
listen_http_addr: Option<String>,
|
||||
@@ -38,29 +36,15 @@ struct CfgFileParams {
|
||||
checkpoint_period: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
open_mem_limit: Option<String>,
|
||||
page_cache_size: Option<String>,
|
||||
max_file_descriptors: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
auth_validation_public_key_path: Option<String>,
|
||||
auth_type: Option<String>,
|
||||
remote_storage_max_concurrent_sync: Option<String>,
|
||||
remote_storage_max_sync_errors: Option<String>,
|
||||
/////////////////////////////////
|
||||
//// Don't put `Option<String>` and other "simple" values below.
|
||||
////
|
||||
/// `Option<RemoteStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
|
||||
/// Values in TOML cannot be defined after tables (other tables can),
|
||||
/// and [`toml`] crate serializes all fields in the order of their appearance.
|
||||
////////////////////////////////
|
||||
remote_storage: Option<RemoteStorage>,
|
||||
// see https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for enum deserialisation examples
|
||||
relish_storage: Option<RelishStorage>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
|
||||
// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
|
||||
#[serde(untagged)]
|
||||
enum RemoteStorage {
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
enum RelishStorage {
|
||||
Local {
|
||||
local_path: String,
|
||||
},
|
||||
@@ -81,37 +65,32 @@ impl CfgFileParams {
|
||||
arg_matches.value_of(arg_name).map(str::to_owned)
|
||||
};
|
||||
|
||||
let remote_storage = if let Some(local_path) = get_arg("remote-storage-local-path") {
|
||||
Some(RemoteStorage::Local { local_path })
|
||||
let relish_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
|
||||
Some(RelishStorage::Local { local_path })
|
||||
} else if let Some((bucket_name, bucket_region)) =
|
||||
get_arg("remote-storage-s3-bucket").zip(get_arg("remote-storage-region"))
|
||||
get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
|
||||
{
|
||||
Some(RemoteStorage::AwsS3 {
|
||||
Some(RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id: get_arg("remote-storage-access-key"),
|
||||
secret_access_key: get_arg("remote-storage-secret-access-key"),
|
||||
access_key_id: get_arg("relish-storage-access-key"),
|
||||
secret_access_key: get_arg("relish-storage-secret-access-key"),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
listen_pg_addr: get_arg("listen_pg_addr"),
|
||||
listen_http_addr: get_arg("listen_http_addr"),
|
||||
listen_pg_addr: get_arg("listen-pg"),
|
||||
listen_http_addr: get_arg("listen-http"),
|
||||
checkpoint_distance: get_arg("checkpoint_distance"),
|
||||
checkpoint_period: get_arg("checkpoint_period"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
open_mem_limit: get_arg("open_mem_limit"),
|
||||
page_cache_size: get_arg("page_cache_size"),
|
||||
max_file_descriptors: get_arg("max_file_descriptors"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||
auth_type: get_arg("auth-type"),
|
||||
remote_storage,
|
||||
remote_storage_max_concurrent_sync: get_arg("remote-storage-max-concurrent-sync"),
|
||||
remote_storage_max_sync_errors: get_arg("remote-storage-max-sync-errors"),
|
||||
relish_storage,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,21 +104,12 @@ impl CfgFileParams {
|
||||
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
|
||||
page_cache_size: self.page_cache_size.or(other.page_cache_size),
|
||||
max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.or(other.auth_validation_public_key_path),
|
||||
auth_type: self.auth_type.or(other.auth_type),
|
||||
remote_storage: self.remote_storage.or(other.remote_storage),
|
||||
remote_storage_max_concurrent_sync: self
|
||||
.remote_storage_max_concurrent_sync
|
||||
.or(other.remote_storage_max_concurrent_sync),
|
||||
remote_storage_max_sync_errors: self
|
||||
.remote_storage_max_sync_errors
|
||||
.or(other.remote_storage_max_sync_errors),
|
||||
relish_storage: self.relish_storage.or(other.relish_storage),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,21 +145,6 @@ impl CfgFileParams {
|
||||
None => DEFAULT_GC_PERIOD,
|
||||
};
|
||||
|
||||
let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
|
||||
Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
|
||||
None => DEFAULT_OPEN_MEM_LIMIT,
|
||||
};
|
||||
|
||||
let page_cache_size: usize = match self.page_cache_size.as_ref() {
|
||||
Some(page_cache_size_str) => page_cache_size_str.parse()?,
|
||||
None => DEFAULT_PAGE_CACHE_SIZE,
|
||||
};
|
||||
|
||||
let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
|
||||
Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
|
||||
None => DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
};
|
||||
|
||||
let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
|
||||
Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
|
||||
None => env::current_dir()?.join("tmp_install"),
|
||||
@@ -223,37 +178,25 @@ impl CfgFileParams {
|
||||
);
|
||||
}
|
||||
|
||||
let max_concurrent_sync = match self.remote_storage_max_concurrent_sync.as_deref() {
|
||||
Some(number_str) => number_str.parse()?,
|
||||
None => NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap(),
|
||||
};
|
||||
let max_sync_errors = match self.remote_storage_max_sync_errors.as_deref() {
|
||||
Some(number_str) => number_str.parse()?,
|
||||
None => NonZeroU32::new(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
|
||||
};
|
||||
let remote_storage_config = self.remote_storage.as_ref().map(|storage_params| {
|
||||
let storage = match storage_params.clone() {
|
||||
RemoteStorage::Local { local_path } => {
|
||||
RemoteStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RemoteStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
};
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
}
|
||||
});
|
||||
let relish_storage_config =
|
||||
self.relish_storage
|
||||
.as_ref()
|
||||
.map(|storage_params| match storage_params.clone() {
|
||||
RelishStorage::Local { local_path } => {
|
||||
RelishStorageConfig::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RelishStorageConfig::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
});
|
||||
|
||||
Ok(PageServerConf {
|
||||
daemonize: false,
|
||||
@@ -264,9 +207,6 @@ impl CfgFileParams {
|
||||
checkpoint_period,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
open_mem_limit,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
|
||||
superuser: String::from(DEFAULT_SUPERUSER),
|
||||
|
||||
@@ -276,28 +216,26 @@ impl CfgFileParams {
|
||||
|
||||
auth_validation_public_key_path,
|
||||
auth_type,
|
||||
remote_storage_config,
|
||||
relish_storage_config,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("listen_pg_addr")
|
||||
Arg::with_name("listen-pg")
|
||||
.short("l")
|
||||
.long("listen_pg_addr")
|
||||
.aliases(&["listen", "listen-pg"]) // keep some compatibility
|
||||
.long("listen-pg")
|
||||
.alias("listen") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen_http_addr")
|
||||
.long("listen_http_addr")
|
||||
.aliases(&["http_endpoint", "listen-http"]) // keep some compatibility
|
||||
Arg::with_name("listen-http")
|
||||
.long("listen-http")
|
||||
.alias("http_endpoint") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
@@ -338,25 +276,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("open_mem_limit")
|
||||
.long("open_mem_limit")
|
||||
.takes_value(true)
|
||||
.help("Amount of memory reserved for buffering incoming WAL"),
|
||||
)
|
||||
.arg(
|
||||
|
||||
Arg::with_name("page_cache_size")
|
||||
.long("page_cache_size")
|
||||
.takes_value(true)
|
||||
.help("Number of pages in the page cache"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("max_file_descriptors")
|
||||
.long("max_file_descriptors")
|
||||
.takes_value(true)
|
||||
.help("Max number of file descriptors to keep open for files"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
@@ -390,48 +309,42 @@ fn main() -> Result<()> {
|
||||
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-local-path")
|
||||
.long("remote-storage-local-path")
|
||||
Arg::with_name("relish-storage-local-path")
|
||||
.long("relish-storage-local-path")
|
||||
.takes_value(true)
|
||||
.help("Path to the local directory, to be used as an external remote storage")
|
||||
.help("Path to the local directory, to be used as an external relish storage")
|
||||
.conflicts_with_all(&[
|
||||
"remote-storage-s3-bucket",
|
||||
"remote-storage-region",
|
||||
"remote-storage-access-key",
|
||||
"remote-storage-secret-access-key",
|
||||
"relish-storage-s3-bucket",
|
||||
"relish-storage-region",
|
||||
"relish-storage-access-key",
|
||||
"relish-storage-secret-access-key",
|
||||
]),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-s3-bucket")
|
||||
.long("remote-storage-s3-bucket")
|
||||
Arg::with_name("relish-storage-s3-bucket")
|
||||
.long("relish-storage-s3-bucket")
|
||||
.takes_value(true)
|
||||
.help("Name of the AWS S3 bucket to use an external remote storage")
|
||||
.requires("remote-storage-region"),
|
||||
.help("Name of the AWS S3 bucket to use an external relish storage")
|
||||
.requires("relish-storage-region"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-region")
|
||||
.long("remote-storage-region")
|
||||
Arg::with_name("relish-storage-region")
|
||||
.long("relish-storage-region")
|
||||
.takes_value(true)
|
||||
.help("Region of the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-access-key")
|
||||
.long("remote-storage-access-key")
|
||||
Arg::with_name("relish-storage-access-key")
|
||||
.long("relish-storage-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-secret-access-key")
|
||||
.long("remote-storage-secret-access-key")
|
||||
Arg::with_name("relish-storage-secret-access-key")
|
||||
.long("relish-storage-secret-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-max-concurrent-sync")
|
||||
.long("remote-storage-max-concurrent-sync")
|
||||
.takes_value(true)
|
||||
.help("Maximum allowed concurrent synchronisations with storage"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
@@ -488,11 +401,6 @@ fn main() -> Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
page_cache::init(conf);
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
|
||||
@@ -514,9 +422,7 @@ fn main() -> Result<()> {
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
info!("version: {}", GIT_VERSION);
|
||||
let (_scope_guard, log_file) = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -525,15 +431,14 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
"Starting pageserver http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
|
||||
|
||||
info!(
|
||||
"Starting pageserver pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
// XXX: Don't spawn any threads before daemonizing!
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
@@ -548,25 +453,18 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
|
||||
if let Some(handle) = remote_storage::run_storage_sync_thread(conf)? {
|
||||
threads.push(handle);
|
||||
}
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
|
||||
// keep join handles for spawned threads
|
||||
let mut join_handles = vec![];
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
@@ -581,204 +479,30 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let cloned = auth.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?,
|
||||
);
|
||||
let http_endpoint_thread = thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?;
|
||||
|
||||
join_handles.push(http_endpoint_thread);
|
||||
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?,
|
||||
);
|
||||
let page_service_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(111);
|
||||
}
|
||||
join_handles.push(page_service_thread);
|
||||
|
||||
Signal::Interrupt | Signal::Terminate => {
|
||||
info!(
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
endpoint::shutdown();
|
||||
|
||||
for handle in std::mem::take(&mut threads) {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(0);
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn page_server_conf_toml_serde() {
|
||||
let params = CfgFileParams {
|
||||
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
|
||||
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
|
||||
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
page_cache_size: Some("page_cache_size_VALUE".to_string()),
|
||||
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
remote_storage: Some(RemoteStorage::Local {
|
||||
local_path: "remote_storage_local_VALUE".to_string(),
|
||||
}),
|
||||
remote_storage_max_concurrent_sync: Some(
|
||||
"remote_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
remote_storage_max_sync_errors: Some(
|
||||
"remote_storage_max_sync_errors_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
let toml_string = toml::to_string(¶ms).expect("Failed to serialize correct config");
|
||||
let toml_pretty_string =
|
||||
toml::to_string_pretty(¶ms).expect("Failed to serialize correct config");
|
||||
assert_eq!(
|
||||
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
|
||||
listen_http_addr = 'listen_http_addr_VALUE'
|
||||
checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
page_cache_size = 'page_cache_size_VALUE'
|
||||
max_file_descriptors = 'max_file_descriptors_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
|
||||
|
||||
[remote_storage]
|
||||
local_path = 'remote_storage_local_VALUE'
|
||||
"#,
|
||||
toml_pretty_string
|
||||
);
|
||||
|
||||
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
|
||||
.expect("Failed to deserialize the serialization result of the config");
|
||||
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
|
||||
.expect("Failed to deserialize the prettified serialization result of the config");
|
||||
assert!(
|
||||
params_from_serialized == params,
|
||||
"Expected the same config in the end of config -> serialize -> deserialize chain"
|
||||
);
|
||||
assert!(
|
||||
params_from_serialized_pretty == params,
|
||||
"Expected the same config in the end of config -> serialize pretty -> deserialize chain"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn credentials_omitted_during_serialization() {
|
||||
let params = CfgFileParams {
|
||||
listen_pg_addr: Some("listen_pg_addr_VALUE".to_string()),
|
||||
listen_http_addr: Some("listen_http_addr_VALUE".to_string()),
|
||||
checkpoint_distance: Some("checkpoint_distance_VALUE".to_string()),
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
page_cache_size: Some("page_cache_size_VALUE".to_string()),
|
||||
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
remote_storage: Some(RemoteStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: Some("access_key_id_VALUE".to_string()),
|
||||
secret_access_key: Some("secret_access_key_VALUE".to_string()),
|
||||
}),
|
||||
remote_storage_max_concurrent_sync: Some(
|
||||
"remote_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
remote_storage_max_sync_errors: Some(
|
||||
"remote_storage_max_sync_errors_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
let toml_string = toml::to_string(¶ms).expect("Failed to serialize correct config");
|
||||
let toml_pretty_string =
|
||||
toml::to_string_pretty(¶ms).expect("Failed to serialize correct config");
|
||||
assert_eq!(
|
||||
r#"listen_pg_addr = 'listen_pg_addr_VALUE'
|
||||
listen_http_addr = 'listen_http_addr_VALUE'
|
||||
checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
page_cache_size = 'page_cache_size_VALUE'
|
||||
max_file_descriptors = 'max_file_descriptors_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
|
||||
|
||||
[remote_storage]
|
||||
bucket_name = 'bucket_name_VALUE'
|
||||
bucket_region = 'bucket_region_VALUE'
|
||||
"#,
|
||||
toml_pretty_string
|
||||
);
|
||||
|
||||
let params_from_serialized: CfgFileParams = toml::from_str(&toml_string)
|
||||
.expect("Failed to deserialize the serialization result of the config");
|
||||
let params_from_serialized_pretty: CfgFileParams = toml::from_str(&toml_pretty_string)
|
||||
.expect("Failed to deserialize the prettified serialization result of the config");
|
||||
|
||||
let mut expected_params = params;
|
||||
expected_params.remote_storage = Some(RemoteStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: None,
|
||||
secret_access_key: None,
|
||||
});
|
||||
assert!(
|
||||
params_from_serialized == expected_params,
|
||||
"Expected the config without credentials in the end of a 'config -> serialize -> deserialize' chain"
|
||||
);
|
||||
assert!(
|
||||
params_from_serialized_pretty == expected_params,
|
||||
"Expected the config without credentials in the end of a 'config -> serialize pretty -> deserialize' chain"
|
||||
);
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error")
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
// TODO: move all paths construction to conf impl
|
||||
//
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
@@ -14,16 +14,14 @@ use std::{
|
||||
str::FromStr,
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use log::*;
|
||||
use zenith_utils::logging;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::tenant_mgr;
|
||||
use crate::walredo::WalRedoManager;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::{repository::Repository, PageServerConf};
|
||||
use crate::{restore_local_repo, LOG_FILE_NAME};
|
||||
|
||||
@@ -36,14 +34,15 @@ pub struct BranchInfo {
|
||||
pub ancestor_id: Option<String>,
|
||||
pub ancestor_lsn: Option<String>,
|
||||
pub current_logical_size: usize,
|
||||
pub current_logical_size_non_incremental: Option<usize>,
|
||||
pub current_logical_size_non_incremental: usize,
|
||||
}
|
||||
|
||||
impl BranchInfo {
|
||||
pub fn from_path<T: AsRef<Path>>(
|
||||
path: T,
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
repo: &Arc<dyn Repository>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Self> {
|
||||
let name = path
|
||||
.as_ref()
|
||||
@@ -56,22 +55,27 @@ impl BranchInfo {
|
||||
|
||||
let timeline = repo.get_timeline(timeline_id)?;
|
||||
|
||||
// we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
|
||||
let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
|
||||
Some(ancestor_id) => (
|
||||
Some(ancestor_id.to_string()),
|
||||
Some(timeline.get_ancestor_lsn().to_string()),
|
||||
),
|
||||
None => (None, None),
|
||||
};
|
||||
let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
// non incremental size calculation can be heavy, so let it be optional
|
||||
// needed for tests to check size calculation
|
||||
let current_logical_size_non_incremental = include_non_incremental_logical_size
|
||||
.then(|| {
|
||||
timeline.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())
|
||||
})
|
||||
.transpose()?;
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
Ok(BranchInfo {
|
||||
name,
|
||||
@@ -80,7 +84,8 @@ impl BranchInfo {
|
||||
ancestor_id,
|
||||
ancestor_lsn,
|
||||
current_logical_size: timeline.get_current_logical_size(),
|
||||
current_logical_size_non_incremental,
|
||||
current_logical_size_non_incremental: timeline
|
||||
.get_current_logical_size_non_incremental(timeline.get_last_record_lsn())?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -94,7 +99,7 @@ pub struct PointInTime {
|
||||
pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str>) -> Result<()> {
|
||||
// Initialize logger
|
||||
// use true as daemonize parameter because otherwise we pollute zenith cli output with a few pages long output of info messages
|
||||
let _log_file = logging::init(LOG_FILE_NAME, true)?;
|
||||
let (_scope_guard, _log_file) = logging::init(LOG_FILE_NAME, true)?;
|
||||
|
||||
// We don't use the real WAL redo manager, because we don't want to spawn the WAL redo
|
||||
// process during repository initialization.
|
||||
@@ -113,7 +118,7 @@ pub fn init_pageserver(conf: &'static PageServerConf, create_tenant: Option<&str
|
||||
println!("initializing tenantid {}", tenantid);
|
||||
create_repo(conf, tenantid, dummy_redo_mgr).with_context(|| "failed to create repo")?;
|
||||
}
|
||||
crashsafe_dir::create_dir_all(conf.tenants_path())?;
|
||||
fs::create_dir_all(conf.tenants_path())?;
|
||||
|
||||
println!("pageserver init succeeded");
|
||||
Ok(())
|
||||
@@ -130,32 +135,27 @@ pub fn create_repo(
|
||||
}
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
crashsafe_dir::create_dir_all(&repo_dir)
|
||||
fs::create_dir_all(&repo_dir)
|
||||
.with_context(|| format!("could not create directory {}", repo_dir.display()))?;
|
||||
|
||||
crashsafe_dir::create_dir(conf.timelines_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
crashsafe_dir::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
fs::create_dir(conf.timelines_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.branches_path(&tenantid))?;
|
||||
fs::create_dir_all(conf.tags_path(&tenantid))?;
|
||||
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// create a new timeline directory
|
||||
let timeline_id = ZTimelineId::generate();
|
||||
let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
|
||||
|
||||
crashsafe_dir::create_dir(&timelinedir)?;
|
||||
let tli = create_timeline(conf, None, &tenantid)?;
|
||||
|
||||
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
|
||||
conf,
|
||||
wal_redo_manager,
|
||||
tenantid,
|
||||
false,
|
||||
));
|
||||
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
|
||||
bootstrap_timeline(conf, tenantid, tli, &*repo)?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -174,16 +174,13 @@ fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
|
||||
// to get bootstrap data for timeline initialization.
|
||||
//
|
||||
fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
info!("running initdb in {}... ", initdbpath.display());
|
||||
info!("running initdb... ");
|
||||
|
||||
let initdb_path = conf.pg_bin_dir().join("initdb");
|
||||
let initdb_output = Command::new(initdb_path)
|
||||
.args(&["-D", initdbpath.to_str().unwrap()])
|
||||
.args(&["-U", &conf.superuser])
|
||||
.arg("--no-instructions")
|
||||
// This is only used for a temporary installation that is deleted shortly after,
|
||||
// so no need to fsync it
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
@@ -196,6 +193,7 @@ fn run_initdb(conf: &'static PageServerConf, initdbpath: &Path) -> Result<()> {
|
||||
String::from_utf8_lossy(&initdb_output.stderr)
|
||||
);
|
||||
}
|
||||
info!("initdb succeeded");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -210,8 +208,6 @@ fn bootstrap_timeline(
|
||||
tli: ZTimelineId,
|
||||
repo: &dyn Repository,
|
||||
) -> Result<()> {
|
||||
let _enter = info_span!("bootstrapping", timeline = %tli, tenant = %tenantid).entered();
|
||||
|
||||
let initdb_path = conf.tenant_path(&tenantid).join("tmp");
|
||||
|
||||
// Init temporarily repo to get bootstrap data
|
||||
@@ -220,17 +216,13 @@ fn bootstrap_timeline(
|
||||
|
||||
let lsn = get_lsn_from_controlfile(&pgdata_path)?.align();
|
||||
|
||||
info!("bootstrap_timeline {:?} at lsn {}", pgdata_path, lsn);
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.writer().as_ref(),
|
||||
lsn,
|
||||
)?;
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(&pgdata_path, &*timeline, lsn)?;
|
||||
timeline.checkpoint()?;
|
||||
|
||||
println!(
|
||||
"created initial timeline {} timeline.lsn {}",
|
||||
@@ -248,38 +240,29 @@ fn bootstrap_timeline(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Vec<BranchInfo>> {
|
||||
pub(crate) fn get_tenants(conf: &PageServerConf) -> Result<Vec<String>> {
|
||||
let tenants_dir = conf.tenants_path();
|
||||
|
||||
std::fs::read_dir(&tenants_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res?;
|
||||
ensure!(dir_entry.file_type()?.is_dir());
|
||||
Ok(dir_entry.file_name().to_str().unwrap().to_owned())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub(crate) fn get_branches(conf: &PageServerConf, tenantid: &ZTenantId) -> Result<Vec<BranchInfo>> {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(*tenantid)?;
|
||||
|
||||
// Each branch has a corresponding record (text file) in the refs/branches
|
||||
// with timeline_id.
|
||||
let branches_dir = conf.branches_path(tenantid);
|
||||
|
||||
std::fs::read_dir(&branches_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Found no branches directory '{}' for tenant {}",
|
||||
branches_dir.display(),
|
||||
tenantid
|
||||
)
|
||||
})?
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res.with_context(|| {
|
||||
format!(
|
||||
"Failed to list branches directory '{}' content for tenant {}",
|
||||
branches_dir.display(),
|
||||
tenantid
|
||||
)
|
||||
})?;
|
||||
BranchInfo::from_path(
|
||||
dir_entry.path(),
|
||||
&repo,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
let dir_entry = dir_entry_res?;
|
||||
BranchInfo::from_path(dir_entry.path(), conf, tenantid, &repo)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -322,26 +305,26 @@ pub(crate) fn create_branch(
|
||||
);
|
||||
}
|
||||
|
||||
let new_timeline_id = ZTimelineId::generate();
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
|
||||
|
||||
// Forward entire timeline creation routine to repository
|
||||
// backend, so it can do all needed initialization
|
||||
repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
|
||||
// Let the Repository backend do its initialization
|
||||
repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
|
||||
|
||||
// Remember the human-readable branch name for the new timeline.
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = new_timeline_id.to_string();
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(branchname, tenantid), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: new_timeline_id,
|
||||
timeline_id: newtli,
|
||||
latest_valid_lsn: startpoint.lsn,
|
||||
ancestor_id: Some(startpoint.timelineid.to_string()),
|
||||
ancestor_lsn: Some(startpoint.lsn.to_string()),
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
current_logical_size: 0,
|
||||
current_logical_size_non_incremental: Some(0),
|
||||
current_logical_size_non_incremental: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -417,3 +400,25 @@ fn parse_point_in_time(
|
||||
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
|
||||
|
||||
fn create_timeline(
|
||||
conf: &PageServerConf,
|
||||
ancestor: Option<PointInTime>,
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
|
||||
let timelineid = ZTimelineId::generate();
|
||||
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
fs::create_dir(&timelinedir.join("wal"))?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
@@ -25,11 +25,6 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
@@ -78,11 +73,6 @@ paths:
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get branches for tenant
|
||||
responses:
|
||||
@@ -174,13 +164,13 @@ paths:
|
||||
description: Get tenants list
|
||||
responses:
|
||||
"200":
|
||||
description: TenantInfo
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
type: string
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
@@ -253,16 +243,6 @@ components:
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
schemas:
|
||||
TenantInfo:
|
||||
type: object
|
||||
required:
|
||||
- id
|
||||
- state
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
BranchInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -270,6 +250,7 @@ components:
|
||||
- timeline_id
|
||||
- latest_valid_lsn
|
||||
- current_logical_size
|
||||
- current_logical_size_non_incremental
|
||||
properties:
|
||||
name:
|
||||
type: string
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -5,7 +6,6 @@ use hyper::header;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use routerify::{ext::RequestExt, RouterBuilder};
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
@@ -15,8 +15,6 @@ use zenith_utils::http::{
|
||||
endpoint,
|
||||
error::HttpErrorBody,
|
||||
json::{json_request, json_response},
|
||||
request::get_request_param,
|
||||
request::parse_request_param,
|
||||
};
|
||||
|
||||
use super::models::BranchCreateRequest;
|
||||
@@ -58,6 +56,33 @@ fn get_config(request: &Request<Body>) -> &'static PageServerConf {
|
||||
get_state(request).conf
|
||||
}
|
||||
|
||||
fn get_request_param<'a>(
|
||||
request: &'a Request<Body>,
|
||||
param_name: &str,
|
||||
) -> Result<&'a str, ApiError> {
|
||||
match request.param(param_name) {
|
||||
Some(arg) => Ok(arg),
|
||||
None => {
|
||||
return Err(ApiError::BadRequest(format!(
|
||||
"no {} specified in path param",
|
||||
param_name
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_request_param<T: FromStr>(
|
||||
request: &Request<Body>,
|
||||
param_name: &str,
|
||||
) -> Result<T, ApiError> {
|
||||
match get_request_param(request, param_name)?.parse() {
|
||||
Ok(v) => Ok(v),
|
||||
Err(_) => Err(ApiError::BadRequest(
|
||||
"failed to parse tenant id".to_string(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
// healthcheck handler
|
||||
async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
Ok(Response::builder()
|
||||
@@ -73,7 +98,6 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
check_permission(&request, Some(request_data.tenant_id))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("/branch_create", name = %request_data.name, tenant = %request_data.tenant_id, startpoint=%request_data.start_point).entered();
|
||||
branches::create_branch(
|
||||
get_config(&request),
|
||||
&request_data.name,
|
||||
@@ -86,53 +110,29 @@ async fn branch_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
Ok(json_response(StatusCode::CREATED, response_data)?)
|
||||
}
|
||||
|
||||
// Gate non incremental logical size calculation behind a flag
|
||||
// after pgbench -i -s100 calculation took 28ms so if multiplied by the number of timelines
|
||||
// and tenants it can take noticeable amount of time. Also the value currently used only in tests
|
||||
fn get_include_non_incremental_logical_size(request: &Request<Body>) -> bool {
|
||||
request
|
||||
.uri()
|
||||
.query()
|
||||
.map(|v| {
|
||||
url::form_urlencoded::parse(v.as_bytes())
|
||||
.into_owned()
|
||||
.any(|(param, _)| param == "include-non-incremental-logical-size")
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
async fn branch_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
check_permission(&request, Some(tenantid))?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_list", tenant = %tenantid).entered();
|
||||
crate::branches::get_branches(
|
||||
get_config(&request),
|
||||
&tenantid,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
crate::branches::get_branches(get_config(&request), &tenantid)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
// TODO add to swagger
|
||||
async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenantid: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
let branch_name: String = get_request_param(&request, "branch_name")?.to_string();
|
||||
let branch_name: &str = get_request_param(&request, "branch_name")?;
|
||||
let conf = get_state(&request).conf;
|
||||
let path = conf.branch_path(&branch_name, &tenantid);
|
||||
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
let path = conf.branch_path(branch_name, &tenantid);
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(path, &repo, include_non_incremental_logical_size)
|
||||
BranchInfo::from_path(path, conf, &tenantid, &repo)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -144,13 +144,10 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::tenant_mgr::list_tenants()
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let response_data =
|
||||
tokio::task::spawn_blocking(move || crate::branches::get_tenants(get_config(&request)))
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
Ok(json_response(StatusCode::OK, response_data)?)
|
||||
}
|
||||
|
||||
@@ -161,7 +158,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = %request_data.tenant_id).entered();
|
||||
tenant_mgr::create_repository_for_tenant(get_config(&request), request_data.tenant_id)
|
||||
})
|
||||
.await
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,56 +1,12 @@
|
||||
# Overview
|
||||
|
||||
The on-disk format is based on immutable files. The page server receives a
|
||||
stream of incoming WAL, parses the WAL records to determine which pages they
|
||||
apply to, and accumulates the incoming changes in memory. Every now and then,
|
||||
the accumulated changes are written out to new immutable files. This process is
|
||||
called checkpointing. Old versions of on-disk files that are not needed by any
|
||||
timeline are removed by GC process.
|
||||
|
||||
The main responsibility of the Page Server is to process the incoming WAL, and
|
||||
reprocess it into a format that allows reasonably quick access to any page
|
||||
version.
|
||||
|
||||
The incoming WAL contains updates to arbitrary pages in the system. The
|
||||
distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between. The page server slices the
|
||||
incoming WAL per relation and page, and packages the sliced WAL into
|
||||
suitably-sized "layer files". The layer files contain all the history of the
|
||||
database, back to some reasonable retention period. This system replaces the
|
||||
base backups and the WAL archive used in a traditional PostgreSQL
|
||||
installation. The layer files are immutable, they are not modified in-place
|
||||
after creation. New layer files are created for new incoming WAL, and old layer
|
||||
files are removed when they are no longer needed. We could also replace layer
|
||||
files with new files that contain the same information, merging small files for
|
||||
example, but that hasn't been implemented yet.
|
||||
|
||||
|
||||
Cloud Storage Page Server Safekeeper
|
||||
Local disk Memory WAL
|
||||
|
||||
|AAAA| |AAAA|AAAA| |AA
|
||||
|BBBB| |BBBB|BBBB| |
|
||||
|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC <---- ADEBAABED
|
||||
|DDDD|DDDD| |DDDD|DDDD| |DDD
|
||||
|EEEE| |EEEE|EEEE|EEEE| |E
|
||||
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
memory. The page server memory can be thought of as a quick "reorder buffer",
|
||||
used to hold the incoming WAL and reorder it so that we keep the WAL records for
|
||||
the same page and relation close to each other.
|
||||
|
||||
From the page server memory, whenever enough WAL has been accumulated for one
|
||||
relation segment, it is moved to local disk, as a new layer file, and the memory
|
||||
is released.
|
||||
|
||||
From the local disk, the layers are further copied to Cloud Storage, for
|
||||
long-term archival. After a layer has been copied to Cloud Storage, it can be
|
||||
removed from local disk, although we currently keep everything locally for fast
|
||||
access. If a layer is needed that isn't found locally, it is fetched from Cloud
|
||||
Storage and stored in local disk.
|
||||
The on-disk format is based on immutable files. The page server
|
||||
receives a stream of incoming WAL, parses the WAL records to determine
|
||||
which pages they apply to, and accumulates the incoming changes in
|
||||
memory. Every now and then, the accumulated changes are written out to
|
||||
new immutable files. This process is called checkpointing. Old versions
|
||||
of on-disk files that are not needed by any timeline are removed by GC
|
||||
process.
|
||||
|
||||
# Terms used in layered repository
|
||||
|
||||
@@ -58,9 +14,32 @@ Storage and stored in local disk.
|
||||
- Segment - one slice of a Relish that is stored in a LayeredTimeline.
|
||||
- Layer - specific version of a relish Segment in a range of LSNs.
|
||||
|
||||
# Layer map
|
||||
Layers can be InMemory or OnDisk:
|
||||
- InMemory layer is not durably stored and needs to rebuild from WAL on pageserver start.
|
||||
- OnDisk layer is durably stored.
|
||||
|
||||
The LayerMap tracks what layers exist for all the relishes in a timeline.
|
||||
OnDisk layers can be Image or Delta:
|
||||
- ImageLayer represents an image or a snapshot of a segment at one particular LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs.
|
||||
|
||||
Dropped segments are always represented on disk by DeltaLayer.
|
||||
|
||||
LSN range defined by start_lsn and end_lsn:
|
||||
- start_lsn is inclusive.
|
||||
- end_lsn is exclusive.
|
||||
|
||||
For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
|
||||
in-memory layer or a delta layer, it is a valid end bound. An image
|
||||
layer represents snapshot at one LSN, so end_lsn is always the
|
||||
snapshot LSN + 1
|
||||
|
||||
Layers can be open or historical:
|
||||
- Open layer is a writeable one. Only InMemory layer can be open.
|
||||
FIXME: If open layer is dropped, it is not writeable, so it should be turned into historical,
|
||||
but now it is not implemented - see bug #569.
|
||||
- Historical layer is the one that cannot be modified anymore. Now only OnDisk layers can be historical.
|
||||
|
||||
- LayerMap - a map that tracks what layers exist for all the relishes in a timeline.
|
||||
|
||||
LayerMap consists of two data structures:
|
||||
- segs - All the layers keyed by segment tag
|
||||
@@ -75,55 +54,8 @@ TODO: Are there any exceptions to this?
|
||||
For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
|
||||
including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.
|
||||
|
||||
|
||||
# Different kinds of layers
|
||||
|
||||
A layer can be in different states:
|
||||
|
||||
- Open - a layer where new WAL records can be appended to.
|
||||
- Closed - a layer that is read-only, no new WAL records can be appended to it
|
||||
- Historic: synonym for closed
|
||||
- InMemory: A layer that needs to be rebuilt from WAL on pageserver start.
|
||||
To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file.
|
||||
- OnDisk: A layer that is stored on disk. If its end-LSN is older than
|
||||
disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk.
|
||||
- Frozen layer: an in-memory layer that is Closed.
|
||||
|
||||
TODO: Clarify the difference between Closed, Historic and Frozen.
|
||||
|
||||
There are two kinds of OnDisk layers:
|
||||
- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
|
||||
relish segment.
|
||||
|
||||
Dropped segments are always represented on disk by DeltaLayer.
|
||||
|
||||
# Layer life cycle
|
||||
|
||||
LSN range defined by start_lsn and end_lsn:
|
||||
- start_lsn is inclusive.
|
||||
- end_lsn is exclusive.
|
||||
|
||||
For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen in-memory
|
||||
layer or a delta layer, it is a valid end bound. An image layer represents
|
||||
snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
|
||||
Every layer starts its life as an Open In-Memory layer. When the page server
|
||||
receives the first WAL record for a segment, it creates a new In-Memory layer
|
||||
for it, and puts it to the layer map. Later, the layer is old enough, its
|
||||
contents are written to disk, as On-Disk layers. This process is called
|
||||
"evicting" a layer.
|
||||
|
||||
Layer eviction is a two-step process: First, the layer is marked as closed, so
|
||||
that it no longer accepts new WAL records, and the layer map is updated
|
||||
accordingly. If a new WAL record for that segment arrives after this step, a new
|
||||
Open layer is created to hold it. After this first step, the layer is a Closed
|
||||
InMemory state. This first step is called "freezing" the layer.
|
||||
|
||||
In the second step, new Delta and Image layers are created, containing all the
|
||||
data in the Frozen InMemory layer. When the new layers are ready, the original
|
||||
frozen layer is replaced with the new layers in the layer map, and the original
|
||||
frozen layer is dropped, releasing the memory.
|
||||
TODO:
|
||||
Describe GC and checkpoint interval settings.
|
||||
|
||||
# Layer files (On-disk layers)
|
||||
|
||||
@@ -434,8 +366,6 @@ is a newer layer file there. TODO: This optimization hasn't been
|
||||
implemented! The GC algorithm will currently keep the file on the
|
||||
'main' branch anyway, for as long as the child branch exists.
|
||||
|
||||
TODO:
|
||||
Describe GC and checkpoint interval settings.
|
||||
|
||||
# TODO: On LSN ranges
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::io::{Read, Write};
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::{fs::File, io::Write};
|
||||
|
||||
use anyhow::Result;
|
||||
use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
|
||||
@@ -11,7 +10,7 @@ pub struct BlobRange {
|
||||
size: usize,
|
||||
}
|
||||
|
||||
pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
@@ -29,14 +28,14 @@ impl<W: Write> BlobWriter<W> {
|
||||
Self { writer, offset: 0 }
|
||||
}
|
||||
|
||||
pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
|
||||
let len = std::io::copy(r, &mut self.writer)?;
|
||||
pub fn write_blob(&mut self, blob: &[u8]) -> Result<BlobRange> {
|
||||
self.writer.write_all(blob)?;
|
||||
|
||||
let range = BlobRange {
|
||||
offset: self.offset,
|
||||
size: len as usize,
|
||||
size: blob.len(),
|
||||
};
|
||||
self.offset += len as u64;
|
||||
self.offset += blob.len() as u64;
|
||||
Ok(range)
|
||||
}
|
||||
|
||||
|
||||
@@ -39,26 +39,27 @@
|
||||
//!
|
||||
use crate::layered_repository::blob::BlobWriter;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::page_versions::PageVersions;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
use std::collections::BTreeMap;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
|
||||
use bookfile::{Book, BookWriter};
|
||||
|
||||
@@ -108,6 +109,12 @@ impl From<&DeltaLayer> for Summary {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct PageVersionMeta {
|
||||
page_image_range: Option<BlobRange>,
|
||||
record_range: Option<BlobRange>,
|
||||
}
|
||||
|
||||
///
|
||||
/// DeltaLayer is the in-memory data structure associated with an
|
||||
/// on-disk delta file. We keep a DeltaLayer in memory for each
|
||||
@@ -132,6 +139,9 @@ pub struct DeltaLayer {
|
||||
|
||||
dropped: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
|
||||
inner: Mutex<DeltaLayerInner>,
|
||||
}
|
||||
|
||||
@@ -140,21 +150,15 @@ pub struct DeltaLayerInner {
|
||||
/// loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
page_version_metas: BTreeMap<(u32, Lsn), PageVersionMeta>,
|
||||
|
||||
/// `relsizes` tracks the size of the relation at different points in time.
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -176,7 +180,29 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
PathBuf::from(
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
Some(Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -184,63 +210,44 @@ impl Layer for DeltaLayer {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
// TODO: avoid opening the file for each read
|
||||
let (_path, book) = self.open_book()?;
|
||||
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
let inner = self.load()?;
|
||||
let page_version_reader = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let iter = inner
|
||||
let mut iter = inner
|
||||
.page_version_metas
|
||||
.slice_range((Included(&minkey), Included(&maxkey)))
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if pv_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
// Found a page image, return it
|
||||
reconstruct_data.page_img = Some(img);
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img_range) = &entry.page_image_range {
|
||||
// Found a page image, return it
|
||||
let img = Bytes::from(read_blob(&page_version_reader, img_range)?);
|
||||
reconstruct_data.page_img = Some(img);
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec_range) = &entry.record_range {
|
||||
let rec = WALRecord::des(&read_blob(&page_version_reader, rec_range)?)?;
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push(rec);
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -248,9 +255,16 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
// caller know about the predecessor layer.
|
||||
if need_image {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
if let Some(cont_layer) = &self.predecessor {
|
||||
Ok(PageReconstructResult::Continue(
|
||||
self.start_lsn,
|
||||
Arc::clone(cont_layer),
|
||||
))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
@@ -259,22 +273,21 @@ impl Layer for DeltaLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let inner = self.load()?;
|
||||
let slice = inner
|
||||
.relsizes
|
||||
.slice_range((Included(&Lsn(0)), Included(&lsn)));
|
||||
let mut iter = inner.relsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
Ok(*entry)
|
||||
let result;
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
result = *entry;
|
||||
// Use the base image if needed
|
||||
} else if let Some(predecessor) = &self.predecessor {
|
||||
result = predecessor.get_seg_size(lsn)?;
|
||||
} else {
|
||||
Err(anyhow::anyhow!("could not find seg size in delta layer"))
|
||||
result = 0;
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Does this segment exist at given LSN?
|
||||
@@ -294,20 +307,17 @@ impl Layer for DeltaLayer {
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.relsizes = VecMap::default();
|
||||
inner.page_version_metas = BTreeMap::new();
|
||||
inner.relsizes = BTreeMap::new();
|
||||
inner.loaded = false;
|
||||
|
||||
// Note: we keep the Book open. Is that a good idea? The virtual file
|
||||
// machinery has its own rules for closing the file descriptor if it's not
|
||||
// needed, but the Book struct uses up some memory, too.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
if let Some(path) = self.path() {
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -315,10 +325,6 @@ impl Layer for DeltaLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
@@ -328,39 +334,32 @@ impl Layer for DeltaLayer {
|
||||
|
||||
println!("--- relsizes ---");
|
||||
let inner = self.load()?;
|
||||
for (k, v) in inner.relsizes.as_slice() {
|
||||
for (k, v) in inner.relsizes.iter() {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
|
||||
let path = self.path();
|
||||
let file = std::fs::File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
for (k, v) in inner.page_version_metas.iter() {
|
||||
let mut desc = String::new();
|
||||
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
rec.rec.len(),
|
||||
rec.will_init,
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
if let Some(page_image_range) = v.page_image_range.as_ref() {
|
||||
let image = read_blob(&chapter, page_image_range)?;
|
||||
write!(&mut desc, " img {} bytes", image.len())?;
|
||||
}
|
||||
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
if let Some(record_range) = v.record_range.as_ref() {
|
||||
let record_bytes = read_blob(&chapter, record_range)?;
|
||||
let rec = WALRecord::des(&record_bytes)?;
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
rec.rec.len(),
|
||||
rec.will_init,
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
println!(" blk {} at {}: {}", k.0, k.1, desc);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -383,14 +382,14 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given page versions and relsizes.
|
||||
/// The page versions are passed in a PageVersions struct. If 'cutoff' is
|
||||
/// given, only page versions with LSN < cutoff are included.
|
||||
/// The page versions are passed by an iterator; the iterator must return
|
||||
/// page versions in blknum+lsn order.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The page_versions and
|
||||
/// relsizes are thus passed in the same format as they are in the in-memory
|
||||
/// layer, as that's expedient.
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create(
|
||||
pub fn create<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -398,14 +397,10 @@ impl DeltaLayer {
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
page_versions: &PageVersions,
|
||||
cutoff: Option<Lsn>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
page_versions: impl Iterator<Item = (&'a (u32, Lsn), &'a PageVersion)>,
|
||||
relsizes: BTreeMap<Lsn, u32>,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
assert!(!relsizes.is_empty());
|
||||
}
|
||||
|
||||
let delta_layer = DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
@@ -415,51 +410,65 @@ impl DeltaLayer {
|
||||
end_lsn,
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
loaded: true,
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes,
|
||||
}),
|
||||
predecessor,
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
// Write the data into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
// Write the in-memory btreemaps into a file
|
||||
let path = delta_layer
|
||||
.path()
|
||||
.expect("DeltaLayer is supposed to have a layer path on disk");
|
||||
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = delta_layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let file = File::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
|
||||
for (blknum, lsn, pos) in page_versions_iter {
|
||||
let blob_range =
|
||||
page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;
|
||||
for (key, page_version) in page_versions {
|
||||
let page_image_range = page_version
|
||||
.page_image
|
||||
.as_ref()
|
||||
.map(|page_image| page_version_writer.write_blob(page_image))
|
||||
.transpose()?;
|
||||
|
||||
inner
|
||||
.page_version_metas
|
||||
.append((blknum, lsn), blob_range)
|
||||
.unwrap();
|
||||
let record_range = page_version
|
||||
.record
|
||||
.as_ref()
|
||||
.map(|record| {
|
||||
let buf = WALRecord::ser(record)?;
|
||||
page_version_writer.write_blob(&buf)
|
||||
})
|
||||
.transpose()?;
|
||||
|
||||
let old = inner.page_version_metas.insert(
|
||||
*key,
|
||||
PageVersionMeta {
|
||||
page_image_range,
|
||||
record_range,
|
||||
},
|
||||
);
|
||||
|
||||
assert!(old.is_none());
|
||||
}
|
||||
|
||||
let book = page_version_writer.close()?;
|
||||
|
||||
// Write out page versions
|
||||
let mut chapter = book.new_chapter(PAGE_VERSION_METAS_CHAPTER);
|
||||
let buf = VecMap::ser(&inner.page_version_metas)?;
|
||||
let buf = BTreeMap::ser(&inner.page_version_metas)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
// and relsizes to separate chapter
|
||||
let mut chapter = book.new_chapter(REL_SIZES_CHAPTER);
|
||||
let buf = VecMap::ser(&inner.relsizes)?;
|
||||
let buf = BTreeMap::ser(&inner.relsizes)?;
|
||||
chapter.write_all(&buf)?;
|
||||
let book = chapter.close()?;
|
||||
|
||||
@@ -478,8 +487,7 @@ impl DeltaLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
@@ -488,6 +496,25 @@ impl DeltaLayer {
|
||||
Ok(delta_layer)
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
},
|
||||
);
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
Ok((path, book))
|
||||
}
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
///
|
||||
@@ -499,14 +526,7 @@ impl DeltaLayer {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)?;
|
||||
inner.book = Some(Book::new(file)?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -534,16 +554,18 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
let chapter = book.read_chapter(PAGE_VERSION_METAS_CHAPTER)?;
|
||||
let page_version_metas = VecMap::des(&chapter)?;
|
||||
let page_version_metas = BTreeMap::des(&chapter)?;
|
||||
|
||||
let chapter = book.read_chapter(REL_SIZES_CHAPTER)?;
|
||||
let relsizes = VecMap::des(&chapter)?;
|
||||
let relsizes = BTreeMap::des(&chapter)?;
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
inner.page_version_metas = page_version_metas;
|
||||
inner.relsizes = relsizes;
|
||||
inner.loaded = true;
|
||||
*inner = DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas,
|
||||
relsizes,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
@@ -554,6 +576,7 @@ impl DeltaLayer {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
filename: &DeltaFileName,
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
) -> DeltaLayer {
|
||||
DeltaLayer {
|
||||
path_or_conf: PathOrConf::Conf(conf),
|
||||
@@ -565,20 +588,17 @@ impl DeltaLayer {
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
}),
|
||||
predecessor,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
@@ -592,29 +612,10 @@ impl DeltaLayer {
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
page_version_metas: BTreeMap::new(),
|
||||
relsizes: BTreeMap::new(),
|
||||
}),
|
||||
predecessor: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> DeltaFileName {
|
||||
DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: self.end_lsn,
|
||||
dropped: self.dropped,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,298 +0,0 @@
|
||||
//! Implementation of append-only file data structure
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{ReadBufResult, WriteBufResult};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
lazy_static! {
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
|
||||
next_file_id: 1,
|
||||
files: HashMap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
pub struct EphemeralFiles {
|
||||
next_file_id: u64,
|
||||
|
||||
files: HashMap<u64, Arc<VirtualFile>>,
|
||||
}
|
||||
|
||||
pub struct EphemeralFile {
|
||||
file_id: u64,
|
||||
_tenantid: ZTenantId,
|
||||
_timelineid: ZTimelineId,
|
||||
file: Arc<VirtualFile>,
|
||||
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub fn create(
|
||||
conf: &PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<EphemeralFile, std::io::Error> {
|
||||
let mut l = EPHEMERAL_FILES.write().unwrap();
|
||||
let file_id = l.next_file_id;
|
||||
l.next_file_id += 1;
|
||||
|
||||
let filename = conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
)?;
|
||||
let file_rc = Arc::new(file);
|
||||
l.files.insert(file_id, file_rc.clone());
|
||||
|
||||
Ok(EphemeralFile {
|
||||
file_id,
|
||||
_tenantid: tenantid,
|
||||
_timelineid: timelineid,
|
||||
file: file_rc,
|
||||
pos: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
|
||||
let mut off = 0;
|
||||
while off < PAGE_SZ {
|
||||
let n = self
|
||||
.file
|
||||
.read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
|
||||
|
||||
if n == 0 {
|
||||
// Reached EOF. Fill the rest of the buffer with zeros.
|
||||
const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
|
||||
|
||||
buf[off..].copy_from_slice(&ZERO_BUF[off..]);
|
||||
break;
|
||||
}
|
||||
|
||||
off += n as usize;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for EphemeralFile {
|
||||
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, dstbuf.len());
|
||||
|
||||
let read_guard;
|
||||
let mut write_guard;
|
||||
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
|
||||
ReadBufResult::Found(guard) => {
|
||||
read_guard = guard;
|
||||
read_guard.as_ref()
|
||||
}
|
||||
ReadBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to read the requested slice from the
|
||||
// buffer.
|
||||
write_guard.as_ref()
|
||||
}
|
||||
};
|
||||
|
||||
dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, srcbuf.len());
|
||||
|
||||
let mut write_guard;
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
|
||||
WriteBufResult::Found(guard) => {
|
||||
write_guard = guard;
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
WriteBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to modify it.
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
};
|
||||
|
||||
buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
|
||||
write_guard.mark_dirty();
|
||||
Ok(len)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for EphemeralFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
|
||||
let n = self.write_at(buf, self.pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for EphemeralFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(_offset) => {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
"SeekFrom::End not supported by EphemeralFile",
|
||||
));
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
if pos < 0 {
|
||||
return Err(Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"offset would be negative",
|
||||
));
|
||||
}
|
||||
if pos > u64::MAX as i128 {
|
||||
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
|
||||
}
|
||||
self.pos = pos as u64;
|
||||
}
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_ephemeral(self.file_id);
|
||||
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
// unlink file
|
||||
// FIXME: print error
|
||||
let _ = std::fs::remove_file(&self.file.path);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
"could not write back page, not found in ephemeral files hash",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
fn repo_harness(
|
||||
test_name: &str,
|
||||
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
}
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0u8);
|
||||
|
||||
efile.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
Ok(String::from_utf8_lossy(&buf)
|
||||
.trim_end_matches('\0')
|
||||
.to_string())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> Result<(), Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
|
||||
|
||||
let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
file_a.write_all(b"foo")?;
|
||||
assert_eq!("foo", read_string(&file_a, 0, 20)?);
|
||||
|
||||
file_a.write_all(b"bar")?;
|
||||
assert_eq!("foobar", read_string(&file_a, 0, 20)?);
|
||||
|
||||
// Open a lot of files, enough to cause some page evictions.
|
||||
let mut efiles = Vec::new();
|
||||
for fileno in 0..100 {
|
||||
let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
efile.write_all(format!("file {}", fileno).as_bytes())?;
|
||||
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
|
||||
efiles.push((fileno, efile));
|
||||
}
|
||||
|
||||
// Check that all the files can still be read from. Use them in random order for
|
||||
// good measure.
|
||||
efiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for (fileno, efile) in efiles.iter_mut() {
|
||||
assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -13,8 +13,6 @@ use anyhow::Result;
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
pub struct DeltaFileName {
|
||||
@@ -37,7 +35,7 @@ impl DeltaFileName {
|
||||
/// Parse a string as a delta file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -170,7 +168,7 @@ impl ImageFileName {
|
||||
/// Parse a string as an image file name. Returns None if the filename does not
|
||||
/// match the expected pattern.
|
||||
///
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
pub fn from_str(fname: &str) -> Option<Self> {
|
||||
let rel;
|
||||
let mut parts;
|
||||
if let Some(rest) = fname.strip_prefix("rel_") {
|
||||
@@ -288,11 +286,11 @@ pub fn list_files(
|
||||
let fname = direntry?.file_name();
|
||||
let fname = fname.to_str().unwrap();
|
||||
|
||||
if let Some(deltafilename) = DeltaFileName::parse_str(fname) {
|
||||
if let Some(deltafilename) = DeltaFileName::from_str(fname) {
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
} else if let Some(imgfilename) = ImageFileName::from_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
} else if fname == "wal" || fname == "metadata" || fname == "ancestor" {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
//!
|
||||
//! Global registry of open layers.
|
||||
//!
|
||||
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
|
||||
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
|
||||
//! in-memory layers in the system, and know when we need to evict some to release
|
||||
//! memory.
|
||||
//!
|
||||
//! Each layer is assigned a unique ID when it's registered in the global registry.
|
||||
//! The ID can be used to relocate the layer later, without having to hold locks.
|
||||
//!
|
||||
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::inmemory_layer::InMemoryLayer;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
|
||||
RwLock::new(InMemoryLayers::default());
|
||||
}
|
||||
|
||||
// TODO these types can probably be smaller
|
||||
#[derive(PartialEq, Eq, Clone, Copy)]
|
||||
pub struct LayerId {
|
||||
index: usize,
|
||||
tag: u64, // to avoid ABA problem
|
||||
}
|
||||
|
||||
enum SlotData {
|
||||
Occupied(Arc<InMemoryLayer>),
|
||||
/// Vacant slots form a linked list, the value is the index
|
||||
/// of the next vacant slot in the list.
|
||||
Vacant(Option<usize>),
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
tag: u64,
|
||||
data: SlotData,
|
||||
usage_count: AtomicU8, // for clock algorithm
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct InMemoryLayers {
|
||||
slots: Vec<Slot>,
|
||||
num_occupied: usize,
|
||||
|
||||
// Head of free-slot list.
|
||||
next_empty_slot_idx: Option<usize>,
|
||||
}
|
||||
|
||||
impl InMemoryLayers {
|
||||
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let slot_idx = match self.next_empty_slot_idx {
|
||||
Some(slot_idx) => slot_idx,
|
||||
None => {
|
||||
let idx = self.slots.len();
|
||||
self.slots.push(Slot {
|
||||
tag: 0,
|
||||
data: SlotData::Vacant(None),
|
||||
usage_count: AtomicU8::new(0),
|
||||
});
|
||||
idx
|
||||
}
|
||||
};
|
||||
let slots_len = self.slots.len();
|
||||
|
||||
let slot = &mut self.slots[slot_idx];
|
||||
|
||||
match slot.data {
|
||||
SlotData::Occupied(_) => {
|
||||
panic!("an occupied slot was in the free list");
|
||||
}
|
||||
SlotData::Vacant(next_empty_slot_idx) => {
|
||||
self.next_empty_slot_idx = next_empty_slot_idx;
|
||||
}
|
||||
}
|
||||
|
||||
slot.data = SlotData::Occupied(layer);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
self.num_occupied += 1;
|
||||
assert!(self.num_occupied <= slots_len);
|
||||
|
||||
LayerId {
|
||||
index: slot_idx,
|
||||
tag: slot.tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
|
||||
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
|
||||
if slot.tag != layer_id.tag {
|
||||
return None;
|
||||
}
|
||||
|
||||
if let SlotData::Occupied(layer) = &slot.data {
|
||||
let _ = slot.usage_count.fetch_update(
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
|old_usage_count| {
|
||||
if old_usage_count < MAX_USAGE_COUNT {
|
||||
Some(old_usage_count + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
Some(Arc::clone(layer))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// TODO this won't be a public API in the future
|
||||
pub fn remove(&mut self, layer_id: &LayerId) {
|
||||
let slot = &mut self.slots[layer_id.index];
|
||||
|
||||
if slot.tag != layer_id.tag {
|
||||
return;
|
||||
}
|
||||
|
||||
match &slot.data {
|
||||
SlotData::Occupied(_layer) => {
|
||||
// TODO evict the layer
|
||||
}
|
||||
SlotData::Vacant(_) => unimplemented!(),
|
||||
}
|
||||
|
||||
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
|
||||
self.next_empty_slot_idx = Some(layer_id.index);
|
||||
|
||||
assert!(self.num_occupied > 0);
|
||||
self.num_occupied -= 1;
|
||||
|
||||
slot.tag = slot.tag.wrapping_add(1);
|
||||
}
|
||||
}
|
||||
@@ -27,15 +27,15 @@ use crate::layered_repository::storage_layer::{
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
@@ -104,8 +104,9 @@ enum ImageType {
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If None, the 'image_type' has not been loaded into memory yet.
|
||||
book: Option<Book<VirtualFile>>,
|
||||
/// If false, the 'image_type' has not been
|
||||
/// loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
@@ -113,11 +114,25 @@ pub struct ImageLayerInner {
|
||||
|
||||
impl Layer for ImageLayer {
|
||||
fn filename(&self) -> PathBuf {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
PathBuf::from(
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
.to_string(),
|
||||
)
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
Some(Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
@@ -146,20 +161,16 @@ impl Layer for ImageLayer {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert!(lsn >= self.lsn);
|
||||
|
||||
match cached_img_lsn {
|
||||
Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let base_blknum = blknum % RELISH_SEG_SIZE;
|
||||
|
||||
let (_path, book) = self.open_book()?;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
if base_blknum >= *num_blocks {
|
||||
@@ -169,23 +180,14 @@ impl Layer for ImageLayer {
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(base_blknum == 0);
|
||||
inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
|
||||
.into_vec()
|
||||
book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
|
||||
}
|
||||
};
|
||||
|
||||
@@ -207,13 +209,22 @@ impl Layer for ImageLayer {
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
///
|
||||
/// Release most of the memory used by this layer. If it's accessed again later,
|
||||
/// it will need to be loaded back.
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.image_type = ImageType::Blocky { num_blocks: 0 };
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete(&self) -> Result<()> {
|
||||
// delete underlying file
|
||||
fs::remove_file(self.path())?;
|
||||
if let Some(path) = self.path() {
|
||||
fs::remove_file(path)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -221,10 +232,6 @@ impl Layer for ImageLayer {
|
||||
false
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
@@ -237,11 +244,8 @@ impl Layer for ImageLayer {
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
|
||||
ImageType::NonBlocky => {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
println!("non-blocky ({} bytes)", chapter.len());
|
||||
}
|
||||
}
|
||||
@@ -289,22 +293,19 @@ impl ImageLayer {
|
||||
seg,
|
||||
lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
loaded: true,
|
||||
image_type: image_type.clone(),
|
||||
}),
|
||||
};
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
let path = layer
|
||||
.path()
|
||||
.expect("ImageLayer is supposed to have a layer path on disk");
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let file = File::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
@@ -336,10 +337,9 @@ impl ImageLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", path.display());
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
drop(inner);
|
||||
|
||||
@@ -391,19 +391,11 @@ impl ImageLayer {
|
||||
// quick exit if already loaded
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
if inner.book.is_some() {
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = self.path();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
|
||||
let book = Book::new(file).with_context(|| {
|
||||
format!(
|
||||
"Failed to open virtual file '{}' as a bookfile",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -444,13 +436,30 @@ impl ImageLayer {
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
book: Some(book),
|
||||
loaded: true,
|
||||
image_type,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
Ok((path, book))
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -465,7 +474,7 @@ impl ImageLayer {
|
||||
seg: filename.seg,
|
||||
lsn: filename.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
}),
|
||||
}
|
||||
@@ -474,10 +483,7 @@ impl ImageLayer {
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<ImageLayer>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<ImageLayer> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
@@ -488,26 +494,9 @@ impl ImageLayer {
|
||||
seg: summary.seg,
|
||||
lsn: summary.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
fn layer_name(&self) -> ImageFileName {
|
||||
ImageFileName {
|
||||
seg: self.seg,
|
||||
lsn: self.lsn,
|
||||
}
|
||||
}
|
||||
|
||||
/// Path to the layer file in pageserver workdir.
|
||||
pub fn path(&self) -> PathBuf {
|
||||
Self::path_for(
|
||||
&self.path_or_conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&self.layer_name(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
//! An in-memory layer stores recently received PageVersions.
|
||||
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
|
||||
//! and layers can be spilled to disk into ephemeral files.
|
||||
//!
|
||||
//! And there's another BTreeMap to track the size of the relation.
|
||||
//! An in-memory layer stores recently received page versions in memory. The page versions
|
||||
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
|
||||
//!
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
@@ -15,15 +12,17 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{ensure, Result};
|
||||
use anyhow::{bail, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use super::page_versions::PageVersions;
|
||||
use zenith_utils::accum::Accum;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -37,87 +36,90 @@ pub struct InMemoryLayer {
|
||||
///
|
||||
start_lsn: Lsn,
|
||||
|
||||
/// Frozen in-memory layers have an inclusive end LSN.
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// LSN of the oldest page version stored in this layer
|
||||
oldest_pending_lsn: Lsn,
|
||||
|
||||
/// The above fields never change. The parts that do change are in 'inner',
|
||||
/// and protected by mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
/// Predecessor layer might be needed?
|
||||
incremental: bool,
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
/// If this relation was dropped, remember when that happened.
|
||||
/// The drop LSN is recorded in [`end_lsn`].
|
||||
dropped: bool,
|
||||
drop_lsn: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// All versions of all pages in the layer are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
///
|
||||
page_versions: PageVersions,
|
||||
page_versions: BTreeMap<(u32, Lsn), PageVersion>,
|
||||
|
||||
///
|
||||
/// `segsizes` tracks the size of the segment at different points in time.
|
||||
///
|
||||
/// For a blocky rel, there is always one entry, at the layer's start_lsn,
|
||||
/// so that determining the size never depends on the predecessor layer. For
|
||||
/// a non-blocky rel, 'segsizes' is not used and is always empty.
|
||||
///
|
||||
segsizes: VecMap<Lsn, u32>,
|
||||
segsizes: BTreeMap<Lsn, u32>,
|
||||
|
||||
/// Writes are only allowed when true.
|
||||
/// Set to false when this layer is in the process of being replaced.
|
||||
writeable: bool,
|
||||
|
||||
/// Predecessor layer
|
||||
predecessor: Option<Arc<dyn Layer>>,
|
||||
}
|
||||
|
||||
impl InMemoryLayerInner {
|
||||
fn assert_writeable(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
fn check_writeable(&self) -> WriteResult<()> {
|
||||
if self.writeable {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(NonWriteableError)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_seg_size(&self, lsn: Lsn) -> u32 {
|
||||
// Scan the BTreeMap backwards, starting from the given entry.
|
||||
let slice = self.segsizes.slice_range(..=lsn);
|
||||
let mut iter = self.segsizes.range((Included(&Lsn(0)), Included(&lsn)));
|
||||
|
||||
// We make sure there is always at least one entry
|
||||
if let Some((_entry_lsn, entry)) = slice.last() {
|
||||
if let Some((_entry_lsn, entry)) = iter.next_back() {
|
||||
*entry
|
||||
} else {
|
||||
panic!("could not find seg size in in-memory layer");
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
// An in-memory layer can be spilled to disk into ephemeral file,
|
||||
// This function is used only for debugging, so we don't need to be very precise.
|
||||
// Construct a filename as if it was a delta layer.
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_lsn;
|
||||
if let Some(drop_lsn) = inner.end_lsn {
|
||||
let dropped;
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
end_lsn = drop_lsn;
|
||||
dropped = true;
|
||||
} else {
|
||||
end_lsn = Lsn(u64::MAX);
|
||||
dropped = false;
|
||||
}
|
||||
|
||||
let delta_filename = DeltaFileName {
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn,
|
||||
dropped: inner.dropped,
|
||||
dropped,
|
||||
}
|
||||
.to_string();
|
||||
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
fn path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
@@ -133,10 +135,14 @@ impl Layer for InMemoryLayer {
|
||||
}
|
||||
|
||||
fn get_end_lsn(&self) -> Lsn {
|
||||
if let Some(end_lsn) = self.end_lsn {
|
||||
return Lsn(end_lsn.0 + 1);
|
||||
}
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
end_lsn
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
drop_lsn
|
||||
} else {
|
||||
Lsn(u64::MAX)
|
||||
}
|
||||
@@ -144,7 +150,7 @@ impl Layer for InMemoryLayer {
|
||||
|
||||
fn is_dropped(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.dropped
|
||||
inner.drop_lsn.is_some()
|
||||
}
|
||||
|
||||
/// Look up given page in the cache.
|
||||
@@ -152,55 +158,50 @@ impl Layer for InMemoryLayer {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
let predecessor: Option<Arc<dyn Layer>>;
|
||||
|
||||
{
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
// Scan the page versions backwards, starting from `lsn`.
|
||||
let iter = inner
|
||||
// Scan the BTreeMap backwards, starting from reconstruct_data.lsn.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
let maxkey = (blknum, lsn);
|
||||
let mut iter = inner
|
||||
.page_versions
|
||||
.get_block_lsn_range(blknum, ..=lsn)
|
||||
.iter()
|
||||
.rev();
|
||||
for (entry_lsn, pos) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if entry_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = inner.page_versions.get_page_version(*pos)?;
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
reconstruct_data.page_img = Some(img);
|
||||
.range((Included(&minkey), Included(&maxkey)));
|
||||
while let Some(((_blknum, _entry_lsn), entry)) = iter.next_back() {
|
||||
if let Some(img) = &entry.page_image {
|
||||
reconstruct_data.page_img = Some(img.clone());
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
reconstruct_data.records.push(rec.clone());
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
predecessor = inner.predecessor.clone();
|
||||
// release lock on 'inner'
|
||||
}
|
||||
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know
|
||||
// caller know about the predecessor layer.
|
||||
if need_image {
|
||||
if self.incremental {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
if let Some(cont_layer) = predecessor {
|
||||
Ok(PageReconstructResult::Continue(self.start_lsn, cont_layer))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Missing(self.start_lsn))
|
||||
}
|
||||
@@ -212,10 +213,6 @@ impl Layer for InMemoryLayer {
|
||||
/// Get size of the relation at given LSN
|
||||
fn get_seg_size(&self, lsn: Lsn) -> Result<u32> {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
ensure!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"get_seg_size() called on a non-blocky rel"
|
||||
);
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
Ok(inner.get_seg_size(lsn))
|
||||
@@ -231,13 +228,9 @@ impl Layer for InMemoryLayer {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if inner.dropped {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
panic!("dropped in-memory layer with no end LSN");
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
if lsn >= drop_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -255,15 +248,12 @@ impl Layer for InMemoryLayer {
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
panic!("can't delete an InMemoryLayer")
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.incremental
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
true
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.predecessor.is_some()
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
@@ -271,41 +261,54 @@ impl Layer for InMemoryLayer {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
let end_str = inner
|
||||
.end_lsn
|
||||
.drop_lsn
|
||||
.as_ref()
|
||||
.map(Lsn::to_string)
|
||||
.map(|drop_lsn| drop_lsn.to_string())
|
||||
.unwrap_or_default();
|
||||
|
||||
println!(
|
||||
"----- in-memory layer for tli {} seg {} {}-{} {} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str, inner.dropped,
|
||||
"----- in-memory layer for tli {} seg {} {}-{} ----",
|
||||
self.timelineid, self.seg, self.start_lsn, end_str
|
||||
);
|
||||
|
||||
for (k, v) in inner.segsizes.as_slice() {
|
||||
for (k, v) in inner.segsizes.iter() {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
let pv = inner.page_versions.get_page_version(pos)?;
|
||||
let pv_description = match pv {
|
||||
PageVersion::Page(_img) => "page",
|
||||
PageVersion::Wal(_rec) => "wal",
|
||||
};
|
||||
|
||||
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
|
||||
for (k, v) in inner.page_versions.iter() {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
k.0,
|
||||
k.1,
|
||||
v.page_image.is_some(),
|
||||
v.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A result of an inmemory layer data being written to disk.
|
||||
pub struct LayersOnDisk {
|
||||
pub delta_layers: Vec<DeltaLayer>,
|
||||
pub image_layers: Vec<ImageLayer>,
|
||||
/// Write failed because the layer is in process of being replaced.
|
||||
/// See [`LayeredTimeline::perform_write_op`] for how to handle this error.
|
||||
#[derive(Debug)]
|
||||
pub struct NonWriteableError;
|
||||
|
||||
pub type WriteResult<T> = std::result::Result<T, NonWriteableError>;
|
||||
|
||||
/// Helper struct to cleanup `InMemoryLayer::freeze` return signature.
|
||||
pub struct FreezeLayers {
|
||||
/// Replacement layer for the layer which freeze was called on.
|
||||
pub frozen: Arc<InMemoryLayer>,
|
||||
/// New open layer containing leftover data.
|
||||
pub open: Option<Arc<InMemoryLayer>>,
|
||||
}
|
||||
|
||||
impl InMemoryLayer {
|
||||
fn assert_not_frozen(&self) {
|
||||
assert!(self.end_lsn.is_none());
|
||||
}
|
||||
|
||||
/// Return the oldest page version that's stored in this layer
|
||||
pub fn get_oldest_pending_lsn(&self) -> Lsn {
|
||||
self.oldest_pending_lsn
|
||||
@@ -329,27 +332,20 @@ impl InMemoryLayer {
|
||||
start_lsn
|
||||
);
|
||||
|
||||
// The segment is initially empty, so initialize 'segsizes' with 0.
|
||||
let mut segsizes = VecMap::default();
|
||||
if seg.rel.is_blocky() {
|
||||
segsizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
incremental: false,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::new(file),
|
||||
segsizes,
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes: BTreeMap::new(),
|
||||
writeable: true,
|
||||
predecessor: None,
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -357,18 +353,33 @@ impl InMemoryLayer {
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
|
||||
pub fn put_wal_record(&self, blknum: u32, rec: WALRecord) -> WriteResult<u32> {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
rec.lsn,
|
||||
PageVersion {
|
||||
page_image: None,
|
||||
record: Some(rec),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Page(img))
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> WriteResult<u32> {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: Some(img),
|
||||
record: None,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<u32> {
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> WriteResult<u32> {
|
||||
self.assert_not_frozen();
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
@@ -380,9 +391,9 @@ impl InMemoryLayer {
|
||||
);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.assert_writeable();
|
||||
inner.check_writeable()?;
|
||||
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
|
||||
let old = inner.page_versions.insert((blknum, lsn), pv);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -418,15 +429,16 @@ impl InMemoryLayer {
|
||||
// subsequent call to initialize the gap page.
|
||||
let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
|
||||
for gapblknum in gapstart..blknum {
|
||||
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
|
||||
let zeropv = PageVersion {
|
||||
page_image: Some(ZERO_PAGE.clone()),
|
||||
record: None,
|
||||
};
|
||||
trace!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum,
|
||||
blknum
|
||||
);
|
||||
let old = inner
|
||||
.page_versions
|
||||
.append_or_update_last(gapblknum, lsn, zeropv)?;
|
||||
let old = inner.page_versions.insert((gapblknum, lsn), zeropv);
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
@@ -437,47 +449,49 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
inner.segsizes.insert(lsn, newsize);
|
||||
return Ok(newsize - oldsize);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) {
|
||||
assert!(
|
||||
self.seg.rel.is_blocky(),
|
||||
"put_truncation() called on a non-blocky rel"
|
||||
);
|
||||
pub fn put_truncation(&self, lsn: Lsn, segsize: u32) -> WriteResult<()> {
|
||||
self.assert_not_frozen();
|
||||
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.assert_writeable();
|
||||
inner.check_writeable()?;
|
||||
|
||||
// check that this we truncate to a smaller size than segment was before the truncation
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
assert!(segsize < oldsize);
|
||||
|
||||
let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
let old = inner.segsizes.insert(lsn, segsize);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
warn!("Inserting truncation, but had an entry for the LSN already");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Remember that the segment was dropped at given LSN
|
||||
pub fn drop_segment(&self, lsn: Lsn) {
|
||||
pub fn drop_segment(&self, lsn: Lsn) -> WriteResult<()> {
|
||||
self.assert_not_frozen();
|
||||
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
assert!(inner.end_lsn.is_none());
|
||||
assert!(!inner.dropped);
|
||||
inner.dropped = true;
|
||||
assert!(self.start_lsn < lsn);
|
||||
inner.end_lsn = Some(lsn);
|
||||
inner.check_writeable()?;
|
||||
|
||||
assert!(inner.drop_lsn.is_none());
|
||||
inner.drop_lsn = Some(lsn);
|
||||
inner.writeable = false;
|
||||
|
||||
trace!("dropped segment {} at {}", self.seg, lsn);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -504,58 +518,137 @@ impl InMemoryLayer {
|
||||
start_lsn,
|
||||
);
|
||||
|
||||
// Copy the segment size at the start LSN from the predecessor layer.
|
||||
let mut segsizes = VecMap::default();
|
||||
// For convenience, copy the segment size from the predecessor layer
|
||||
let mut segsizes = BTreeMap::new();
|
||||
if seg.rel.is_blocky() {
|
||||
let size = src.get_seg_size(start_lsn)?;
|
||||
segsizes.append(start_lsn, size).unwrap();
|
||||
segsizes.insert(start_lsn, size);
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
tenantid,
|
||||
seg,
|
||||
start_lsn,
|
||||
end_lsn: None,
|
||||
oldest_pending_lsn,
|
||||
incremental: true,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::new(file),
|
||||
drop_lsn: None,
|
||||
page_versions: BTreeMap::new(),
|
||||
segsizes,
|
||||
writeable: true,
|
||||
predecessor: Some(src),
|
||||
}),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_writeable(&self) -> bool {
|
||||
let inner = self.inner.read().unwrap();
|
||||
inner.end_lsn.is_none()
|
||||
inner.writeable
|
||||
}
|
||||
|
||||
/// Make the layer non-writeable. Only call once.
|
||||
/// Records the end_lsn for non-dropped layers.
|
||||
/// `end_lsn` is inclusive
|
||||
pub fn freeze(&self, end_lsn: Lsn) {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
/// Splits `self` into two InMemoryLayers: `frozen` and `open`.
|
||||
/// All data up to and including `cutoff_lsn`
|
||||
/// is copied to `frozen`, while the remaining data is copied to `open`.
|
||||
/// After completion, self is non-writeable, but not frozen.
|
||||
pub fn freeze(self: Arc<Self>, cutoff_lsn: Lsn) -> Result<FreezeLayers> {
|
||||
info!(
|
||||
"freezing in memory layer {} on timeline {} at {} (oldest {})",
|
||||
self.filename().display(),
|
||||
self.timelineid,
|
||||
cutoff_lsn,
|
||||
self.oldest_pending_lsn
|
||||
);
|
||||
|
||||
if inner.end_lsn.is_some() {
|
||||
assert!(inner.dropped);
|
||||
} else {
|
||||
assert!(!inner.dropped);
|
||||
assert!(self.start_lsn < end_lsn + 1);
|
||||
inner.end_lsn = Some(Lsn(end_lsn.0 + 1));
|
||||
self.assert_not_frozen();
|
||||
|
||||
if let Some((lsn, _)) = inner.segsizes.as_slice().last() {
|
||||
assert!(lsn <= &end_lsn, "{:?} {:?}", lsn, end_lsn);
|
||||
}
|
||||
let self_ref = self.clone();
|
||||
let mut inner = self_ref.inner.write().unwrap();
|
||||
// Dropped layers don't need any special freeze actions,
|
||||
// they are marked as non-writeable at drop and just
|
||||
// written out to disk by checkpointer.
|
||||
if inner.drop_lsn.is_some() {
|
||||
assert!(!inner.writeable);
|
||||
info!(
|
||||
"freezing in memory layer for {} on timeline {} is dropped at {}",
|
||||
self.seg,
|
||||
self.timelineid,
|
||||
inner.drop_lsn.unwrap()
|
||||
);
|
||||
|
||||
for (_blk, lsn, _pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
assert!(lsn <= end_lsn);
|
||||
// There should be no newer layer that refers this non-writeable layer,
|
||||
// because layer that is created after dropped one represents a new rel.
|
||||
return Ok(FreezeLayers {
|
||||
frozen: self,
|
||||
open: None,
|
||||
});
|
||||
}
|
||||
assert!(inner.writeable);
|
||||
inner.writeable = false;
|
||||
|
||||
// Divide all the page versions into old and new
|
||||
// at the 'cutoff_lsn' point.
|
||||
let mut before_segsizes = BTreeMap::new();
|
||||
let mut after_segsizes = BTreeMap::new();
|
||||
let mut after_oldest_lsn: Accum<Lsn> = Accum(None);
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn > cutoff_lsn {
|
||||
after_segsizes.insert(*lsn, *size);
|
||||
after_oldest_lsn.accum(min, *lsn);
|
||||
} else {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
|
||||
let mut before_page_versions = BTreeMap::new();
|
||||
let mut after_page_versions = BTreeMap::new();
|
||||
for ((blknum, lsn), pv) in inner.page_versions.iter() {
|
||||
if *lsn > cutoff_lsn {
|
||||
after_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
after_oldest_lsn.accum(min, *lsn);
|
||||
} else {
|
||||
before_page_versions.insert((*blknum, *lsn), pv.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let frozen = Arc::new(InMemoryLayer {
|
||||
conf: self.conf,
|
||||
tenantid: self.tenantid,
|
||||
timelineid: self.timelineid,
|
||||
seg: self.seg,
|
||||
start_lsn: self.start_lsn,
|
||||
end_lsn: Some(cutoff_lsn),
|
||||
oldest_pending_lsn: self.start_lsn,
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
drop_lsn: inner.drop_lsn,
|
||||
page_versions: before_page_versions,
|
||||
segsizes: before_segsizes,
|
||||
writeable: false,
|
||||
predecessor: inner.predecessor.clone(),
|
||||
}),
|
||||
});
|
||||
|
||||
let open = if !after_segsizes.is_empty() || !after_page_versions.is_empty() {
|
||||
let mut new_open = Self::create_successor_layer(
|
||||
self.conf,
|
||||
frozen.clone(),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
cutoff_lsn + 1,
|
||||
after_oldest_lsn.0.unwrap(),
|
||||
)?;
|
||||
|
||||
let new_inner = new_open.inner.get_mut().unwrap();
|
||||
new_inner.page_versions.append(&mut after_page_versions);
|
||||
new_inner.segsizes.append(&mut after_segsizes);
|
||||
|
||||
Some(Arc::new(new_open))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(FreezeLayers { frozen, open })
|
||||
}
|
||||
|
||||
/// Write the this frozen in-memory layer to disk.
|
||||
@@ -566,15 +659,16 @@ impl InMemoryLayer {
|
||||
/// WAL records between start and end LSN. (The delta layer is not needed
|
||||
/// when a new relish is created with a single LSN, so that the start and
|
||||
/// end LSN are the same.)
|
||||
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<LayersOnDisk> {
|
||||
pub fn write_to_disk(&self, timeline: &LayeredTimeline) -> Result<Vec<Arc<dyn Layer>>> {
|
||||
trace!(
|
||||
"write_to_disk {} get_end_lsn is {}",
|
||||
"write_to_disk {} end_lsn is {} get_end_lsn is {}",
|
||||
self.filename().display(),
|
||||
self.end_lsn.unwrap_or(Lsn(0)),
|
||||
self.get_end_lsn()
|
||||
);
|
||||
|
||||
// Grab the lock in read-mode. We hold it over the I/O, but because this
|
||||
// layer is not writeable anymore, no one should be trying to acquire the
|
||||
// layer is not writeable anymore, no one should be trying to aquire the
|
||||
// write lock on it, so we shouldn't block anyone. There's one exception
|
||||
// though: another thread might have grabbed a reference to this layer
|
||||
// in `get_layer_for_write' just before the checkpointer called
|
||||
@@ -583,42 +677,49 @@ impl InMemoryLayer {
|
||||
// would have to wait until we release it. That race condition is very
|
||||
// rare though, so we just accept the potential latency hit for now.
|
||||
let inner = self.inner.read().unwrap();
|
||||
let end_lsn_exclusive = inner.end_lsn.unwrap();
|
||||
assert!(!inner.writeable);
|
||||
|
||||
if inner.dropped {
|
||||
let predecessor = inner.predecessor.clone();
|
||||
|
||||
if let Some(drop_lsn) = inner.drop_lsn {
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive,
|
||||
drop_lsn,
|
||||
true,
|
||||
&inner.page_versions,
|
||||
None,
|
||||
predecessor,
|
||||
inner.page_versions.iter(),
|
||||
inner.segsizes.clone(),
|
||||
)?;
|
||||
trace!(
|
||||
"freeze: created delta layer for dropped segment {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive
|
||||
drop_lsn
|
||||
);
|
||||
return Ok(LayersOnDisk {
|
||||
delta_layers: vec![delta_layer],
|
||||
image_layers: Vec::new(),
|
||||
});
|
||||
return Ok(vec![Arc::new(delta_layer)]);
|
||||
}
|
||||
|
||||
// Since `end_lsn` is inclusive, subtract 1.
|
||||
// We want to make an ImageLayer for the last included LSN,
|
||||
// so the DeltaLayer should exclude that LSN.
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
let end_lsn = self.end_lsn.unwrap();
|
||||
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut before_segsizes = BTreeMap::new();
|
||||
for (lsn, size) in inner.segsizes.iter() {
|
||||
if *lsn <= end_lsn {
|
||||
before_segsizes.insert(*lsn, *size);
|
||||
}
|
||||
}
|
||||
let mut before_page_versions = inner.page_versions.iter().filter(|tup| {
|
||||
let ((_blknum, lsn), _pv) = tup;
|
||||
|
||||
if self.start_lsn != end_lsn_inclusive {
|
||||
let (segsizes, _) = inner.segsizes.split_at(&end_lsn_exclusive);
|
||||
*lsn < end_lsn
|
||||
});
|
||||
|
||||
let mut frozen_layers: Vec<Arc<dyn Layer>> = Vec::new();
|
||||
|
||||
if self.start_lsn != end_lsn {
|
||||
// Write the page versions before the cutoff to disk.
|
||||
let delta_layer = DeltaLayer::create(
|
||||
self.conf,
|
||||
@@ -626,41 +727,35 @@ impl InMemoryLayer {
|
||||
self.tenantid,
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_inclusive,
|
||||
end_lsn,
|
||||
false,
|
||||
&inner.page_versions,
|
||||
Some(end_lsn_inclusive),
|
||||
segsizes,
|
||||
predecessor,
|
||||
before_page_versions,
|
||||
before_segsizes,
|
||||
)?;
|
||||
delta_layers.push(delta_layer);
|
||||
frozen_layers.push(Arc::new(delta_layer));
|
||||
trace!(
|
||||
"freeze: created delta layer {} {}-{}",
|
||||
self.seg,
|
||||
self.start_lsn,
|
||||
end_lsn_inclusive
|
||||
end_lsn
|
||||
);
|
||||
} else {
|
||||
assert!(inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(None)
|
||||
.next()
|
||||
.is_none());
|
||||
assert!(before_page_versions.next().is_none());
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
// Write a new base image layer at the cutoff point
|
||||
let image_layer =
|
||||
ImageLayer::create_from_src(self.conf, timeline, self, end_lsn_inclusive)?;
|
||||
trace!(
|
||||
"freeze: created image layer {} at {}",
|
||||
self.seg,
|
||||
end_lsn_inclusive
|
||||
);
|
||||
let image_layer = ImageLayer::create_from_src(self.conf, timeline, self, end_lsn)?;
|
||||
frozen_layers.push(Arc::new(image_layer));
|
||||
trace!("freeze: created image layer {} at {}", self.seg, end_lsn);
|
||||
|
||||
Ok(LayersOnDisk {
|
||||
delta_layers,
|
||||
image_layers: vec![image_layer],
|
||||
})
|
||||
Ok(frozen_layers)
|
||||
}
|
||||
|
||||
pub fn update_predecessor(&self, predecessor: Arc<dyn Layer>) -> Option<Arc<dyn Layer>> {
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
inner.predecessor.replace(predecessor)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -283,7 +283,6 @@ mod tests {
|
||||
write!(f, "{}", self.val)
|
||||
}
|
||||
}
|
||||
#[rustfmt::skip]
|
||||
fn assert_search(
|
||||
tree: &IntervalTree<MockItem>,
|
||||
key: u32,
|
||||
@@ -292,20 +291,24 @@ mod tests {
|
||||
if let Some(v) = tree.search(key) {
|
||||
let vstr = v.to_string();
|
||||
|
||||
assert!(!expected.is_empty(), "search with {} returned {}, expected None", key, v);
|
||||
assert!(
|
||||
expected.contains(&vstr.as_str()),
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected,
|
||||
);
|
||||
if expected.is_empty() {
|
||||
panic!("search with {} returned {}, expected None", key, v);
|
||||
}
|
||||
|
||||
if !expected.contains(&vstr.as_str()) {
|
||||
panic!(
|
||||
"search with {} returned {}, expected one of: {:?}",
|
||||
key, v, expected
|
||||
);
|
||||
}
|
||||
Some(v)
|
||||
} else {
|
||||
assert!(
|
||||
expected.is_empty(),
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
if !expected.is_empty() {
|
||||
panic!(
|
||||
"search with {} returned None, expected one of {:?}",
|
||||
key, expected
|
||||
);
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,8 +21,6 @@ use std::sync::Arc;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_INMEMORY_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
|
||||
@@ -70,9 +68,7 @@ impl LayerMap {
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry
|
||||
.open_layer_id
|
||||
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
|
||||
segentry.open.as_ref().map(Arc::clone)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -81,7 +77,7 @@ impl LayerMap {
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
let layer_id = segentry.update_open(Arc::clone(&layer));
|
||||
segentry.update_open(Arc::clone(&layer));
|
||||
|
||||
let oldest_pending_lsn = layer.get_oldest_pending_lsn();
|
||||
|
||||
@@ -93,7 +89,7 @@ impl LayerMap {
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
|
||||
layer_id,
|
||||
layer,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_layers.push(open_layer_entry);
|
||||
@@ -101,35 +97,24 @@ impl LayerMap {
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove an open in-memory layer
|
||||
pub fn remove_open(&mut self, layer_id: LayerId) {
|
||||
// Note: we don't try to remove the entry from the binary heap.
|
||||
// It will be removed lazily by peek_oldest_open() when it's made it to
|
||||
// the top of the heap.
|
||||
/// Remove the oldest in-memory layer
|
||||
pub fn pop_oldest_open(&mut self) {
|
||||
// Pop it from the binary heap
|
||||
let oldest_entry = self.open_layers.pop().unwrap();
|
||||
let segtag = oldest_entry.layer.get_seg_tag();
|
||||
|
||||
let layer_opt = {
|
||||
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
|
||||
let layer_opt = global_map.get(&layer_id);
|
||||
global_map.remove(&layer_id);
|
||||
// TODO it's bad that a ref can still exist after being evicted from cache
|
||||
layer_opt
|
||||
};
|
||||
|
||||
if let Some(layer) = layer_opt {
|
||||
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
|
||||
|
||||
if segentry.open_layer_id == Some(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
segentry.open_layer_id = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!layer.is_writeable());
|
||||
assert!(layer.is_dropped());
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
// Also remove it from the SegEntry of this segment
|
||||
let mut segentry = self.segs.get_mut(&segtag).unwrap();
|
||||
if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
|
||||
segentry.open = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!oldest_entry.layer.is_writeable());
|
||||
assert!(oldest_entry.layer.is_dropped());
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
}
|
||||
|
||||
///
|
||||
@@ -214,17 +199,10 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
|
||||
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
|
||||
while let Some(oldest_entry) = self.open_layers.peek() {
|
||||
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
|
||||
} else {
|
||||
self.open_layers.pop();
|
||||
}
|
||||
}
|
||||
None
|
||||
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
|
||||
self.open_layers
|
||||
.peek()
|
||||
.map(|oldest_entry| (Arc::clone(&oldest_entry.layer), oldest_entry.generation))
|
||||
}
|
||||
|
||||
/// Increment the generation number used to stamp open in-memory layers. Layers
|
||||
@@ -247,12 +225,8 @@ impl LayerMap {
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open_layer_id {
|
||||
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
|
||||
layer.dump()?;
|
||||
} else {
|
||||
println!("layer not found in global map");
|
||||
}
|
||||
if let Some(open) = &segentry.open {
|
||||
open.dump()?;
|
||||
}
|
||||
|
||||
for layer in segentry.historic.iter() {
|
||||
@@ -285,7 +259,7 @@ impl IntervalItem for dyn Layer {
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open_layer_id: Option<LayerId>,
|
||||
open: Option<Arc<InMemoryLayer>>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
}
|
||||
|
||||
@@ -301,10 +275,10 @@ impl SegEntry {
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open_layer_id) = &self.open_layer_id {
|
||||
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
|
||||
if open_layer.get_start_lsn() <= lsn {
|
||||
return Some(open_layer);
|
||||
if let Some(open) = &self.open {
|
||||
if open.get_start_lsn() <= lsn {
|
||||
let x: Arc<dyn Layer> = Arc::clone(open) as _;
|
||||
return Some(x);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,16 +297,11 @@ impl SegEntry {
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
if let Some(prev_open_layer_id) = &self.open_layer_id {
|
||||
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
|
||||
{
|
||||
assert!(!prev_open_layer.is_writeable());
|
||||
}
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
if let Some(prev_open) = &self.open {
|
||||
assert!(!prev_open.is_writeable());
|
||||
}
|
||||
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
|
||||
self.open_layer_id = Some(open_layer_id);
|
||||
open_layer_id
|
||||
self.open = Some(layer);
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
@@ -347,9 +316,9 @@ impl SegEntry {
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_pending_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
generation: u64,
|
||||
layer_id: LayerId,
|
||||
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
pub generation: u64,
|
||||
pub layer: Arc<InMemoryLayer>,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
@@ -414,13 +383,6 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
lazy_static! {
|
||||
static ref DUMMY_TIMELINEID: ZTimelineId =
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
|
||||
static ref DUMMY_TENANTID: ZTenantId =
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
|
||||
}
|
||||
|
||||
/// Construct a dummy InMemoryLayer for testing
|
||||
fn dummy_inmem_layer(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -431,8 +393,8 @@ mod tests {
|
||||
Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
*DUMMY_TIMELINEID,
|
||||
*DUMMY_TENANTID,
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
SegmentTag {
|
||||
rel: TESTREL_A,
|
||||
segno,
|
||||
@@ -448,7 +410,6 @@ mod tests {
|
||||
fn test_open_layers() -> Result<()> {
|
||||
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
|
||||
|
||||
let mut layers = LayerMap::default();
|
||||
|
||||
@@ -465,10 +426,10 @@ mod tests {
|
||||
// A helper function (closure) to pop the next oldest open entry from the layer map,
|
||||
// and assert that it is what we'd expect
|
||||
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
|
||||
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
|
||||
let (l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.remove_open(layer_id);
|
||||
layers.pop_oldest_open();
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
|
||||
@@ -1,226 +0,0 @@
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
|
||||
use std::{convert::TryInto, path::PathBuf};
|
||||
|
||||
use anyhow::ensure;
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
|
||||
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
|
||||
pub fn prev_record_lsn(&self) -> Option<Lsn> {
|
||||
self.prev_record_lsn
|
||||
}
|
||||
|
||||
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline
|
||||
}
|
||||
|
||||
pub fn ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
|
||||
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
|
||||
self.latest_gc_cutoff_lsn
|
||||
}
|
||||
|
||||
pub fn initdb_lsn(&self) -> Lsn {
|
||||
self.initdb_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// This module is for direct conversion of metadata to bytes and back.
|
||||
/// For a certain metadata, besides the conversion a few verification steps has to
|
||||
/// be done, so all serde derives are hidden from the user, to avoid accidental
|
||||
/// verification-less metadata creation.
|
||||
mod serialize {
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
|
||||
|
||||
use super::TimelineMetadata;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SeTimelineMetadata<'a> {
|
||||
disk_consistent_lsn: &'a Lsn,
|
||||
prev_record_lsn: &'a Option<Lsn>,
|
||||
ancestor_timeline: &'a Option<ZTimelineId>,
|
||||
ancestor_lsn: &'a Lsn,
|
||||
latest_gc_cutoff_lsn: &'a Lsn,
|
||||
initdb_lsn: &'a Lsn,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
|
||||
fn from(other: &'a TimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: &other.disk_consistent_lsn,
|
||||
prev_record_lsn: &other.prev_record_lsn,
|
||||
ancestor_timeline: &other.ancestor_timeline,
|
||||
ancestor_lsn: &other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: &other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub(super) struct DeTimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<DeTimelineMetadata> for TimelineMetadata {
|
||||
fn from(other: DeTimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: other.disk_consistent_lsn,
|
||||
prev_record_lsn: other.prev_record_lsn,
|
||||
ancestor_timeline: other.ancestor_timeline,
|
||||
ancestor_lsn: other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::repository::repo_harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
let original_metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
latest_gc_cutoff_lsn: Lsn(0),
|
||||
initdb_lsn: Lsn(0),
|
||||
};
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Should serialize correct metadata to bytes");
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata, original_metadata,
|
||||
"Metadata that was serialized to bytes and deserialized back should not change"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,252 +0,0 @@
|
||||
//!
|
||||
//! Data structure to ingest incoming WAL into an append-only file.
|
||||
//!
|
||||
//! - The file is considered temporary, and will be discarded on crash
|
||||
//! - based on a B-tree
|
||||
//!
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::{collections::HashMap, ops::RangeBounds, slice};
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::io::Seek;
|
||||
|
||||
use zenith_utils::{lsn::Lsn, vec_map::VecMap};
|
||||
|
||||
use super::storage_layer::PageVersion;
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
|
||||
const EMPTY_SLICE: &[(Lsn, u64)] = &[];
|
||||
|
||||
pub struct PageVersions {
|
||||
map: HashMap<u32, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The PageVersion structs are stored in a serialized format in this file.
|
||||
/// Each serialized PageVersion is preceded by a 'u32' length field.
|
||||
/// The 'map' stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
}
|
||||
|
||||
impl PageVersions {
|
||||
pub fn new(file: EphemeralFile) -> PageVersions {
|
||||
PageVersions {
|
||||
map: HashMap::new(),
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
page_version: PageVersion,
|
||||
) -> Result<Option<u64>> {
|
||||
// remember starting position
|
||||
let pos = self.file.stream_position()?;
|
||||
|
||||
// make room for the 'length' field by writing zeros as a placeholder.
|
||||
self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
|
||||
|
||||
page_version.ser_into(&mut self.file).unwrap();
|
||||
|
||||
// write the 'length' field.
|
||||
let len = self.file.stream_position()? - pos - 4;
|
||||
let lenbuf = u32::to_ne_bytes(len as u32);
|
||||
self.file.write_all_at(&lenbuf, pos)?;
|
||||
|
||||
let map = self.map.entry(blknum).or_insert_with(VecMap::default);
|
||||
Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
|
||||
}
|
||||
|
||||
/// Get all [`PageVersion`]s in a block
|
||||
fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
|
||||
self.map
|
||||
.get(&blknum)
|
||||
.map(VecMap::as_slice)
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Get a range of [`PageVersions`] in a block
|
||||
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
|
||||
self.map
|
||||
.get(&blknum)
|
||||
.map(|vec_map| vec_map.slice_range(range))
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Iterate through [`PageVersion`]s in (block, lsn) order.
|
||||
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
|
||||
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
|
||||
let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
|
||||
ordered_blocks.sort_unstable();
|
||||
|
||||
let slice = ordered_blocks
|
||||
.first()
|
||||
.map(|&blknum| self.get_block_slice(blknum))
|
||||
.unwrap_or(EMPTY_SLICE);
|
||||
|
||||
OrderedPageVersionIter {
|
||||
page_versions: self,
|
||||
ordered_blocks,
|
||||
cur_block_idx: 0,
|
||||
cutoff_lsn,
|
||||
cur_slice_iter: slice.iter(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a 'Read' that reads the page version at given offset.
|
||||
pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, pos)?;
|
||||
let len = u32::from_ne_bytes(lenbuf);
|
||||
|
||||
Ok(PageVersionReader {
|
||||
file: &self.file,
|
||||
pos: pos + 4,
|
||||
end_pos: pos + 4 + len as u64,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
|
||||
let mut reader = self.reader(pos)?;
|
||||
Ok(PageVersion::des_from(&mut reader)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageVersionReader<'a> {
|
||||
file: &'a EphemeralFile,
|
||||
pos: u64,
|
||||
end_pos: u64,
|
||||
}
|
||||
|
||||
impl<'a> std::io::Read for PageVersionReader<'a> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let len = min(buf.len(), (self.end_pos - self.pos) as usize);
|
||||
let n = self.file.read_at(&mut buf[..len], self.pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OrderedPageVersionIter<'a> {
|
||||
page_versions: &'a PageVersions,
|
||||
|
||||
ordered_blocks: Vec<u32>,
|
||||
cur_block_idx: usize,
|
||||
|
||||
cutoff_lsn: Option<Lsn>,
|
||||
|
||||
cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
|
||||
}
|
||||
|
||||
impl OrderedPageVersionIter<'_> {
|
||||
fn is_lsn_before_cutoff(&self, lsn: &Lsn) -> bool {
|
||||
if let Some(cutoff_lsn) = self.cutoff_lsn.as_ref() {
|
||||
lsn < cutoff_lsn
|
||||
} else {
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for OrderedPageVersionIter<'a> {
|
||||
type Item = (u32, Lsn, u64);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some((lsn, pos)) = self.cur_slice_iter.next() {
|
||||
if self.is_lsn_before_cutoff(lsn) {
|
||||
let blknum = self.ordered_blocks[self.cur_block_idx];
|
||||
return Some((blknum, *lsn, *pos));
|
||||
}
|
||||
}
|
||||
|
||||
let next_block_idx = self.cur_block_idx + 1;
|
||||
let blknum: u32 = *self.ordered_blocks.get(next_block_idx)?;
|
||||
self.cur_block_idx = next_block_idx;
|
||||
self.cur_slice_iter = self.page_versions.get_block_slice(blknum).iter();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::Bytes;
|
||||
|
||||
use super::*;
|
||||
use crate::PageServerConf;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ordered_iter() -> Result<()> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
let mut page_versions = PageVersions::new(file);
|
||||
|
||||
const BLOCKS: u32 = 1000;
|
||||
const LSNS: u64 = 50;
|
||||
|
||||
let empty_page = Bytes::from_static(&[0u8; 8192]);
|
||||
let empty_page_version = PageVersion::Page(empty_page);
|
||||
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let old = page_versions.append_or_update_last(
|
||||
blknum,
|
||||
Lsn(lsn),
|
||||
empty_page_version.clone(),
|
||||
)?;
|
||||
assert!(old.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
let mut iter = page_versions.ordered_page_version_iter(None);
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
|
||||
assert_eq!(actual_blknum, blknum);
|
||||
assert_eq!(Lsn(lsn), actual_lsn);
|
||||
}
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
|
||||
const CUTOFF_LSN: Lsn = Lsn(30);
|
||||
let mut iter = page_versions.ordered_page_version_iter(Some(CUTOFF_LSN));
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..CUTOFF_LSN.0 {
|
||||
let (actual_blknum, actual_lsn, _pv) = iter.next().unwrap();
|
||||
assert_eq!(actual_blknum, blknum);
|
||||
assert_eq!(Lsn(lsn), actual_lsn);
|
||||
}
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -4,12 +4,13 @@
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
@@ -51,10 +52,23 @@ impl SegmentTag {
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
///
|
||||
/// It's also possible to have both a WAL record and a page image in the same
|
||||
/// PageVersion. That happens if page version is originally stored as a WAL record
|
||||
/// but it is later reconstructed by a GetPage@LSN request by performing WAL
|
||||
/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
|
||||
/// the WAL record in that case. TODO: That's pretty accidental, not the result
|
||||
/// of any grand design. If we want to keep reconstructed page versions around, we
|
||||
/// probably should have a separate buffer cache so that we could control the
|
||||
/// replacement policy globally. Or if we keep a reconstructed page image, we
|
||||
/// could throw away the WAL record.
|
||||
///
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
Page(Bytes),
|
||||
Wal(WALRecord),
|
||||
pub struct PageVersion {
|
||||
/// an 8kb page image
|
||||
pub page_image: Option<Bytes>,
|
||||
/// WAL record to get from previous page version to this one.
|
||||
pub record: Option<WALRecord>,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -65,7 +79,7 @@ pub enum PageVersion {
|
||||
/// 'records' contains the records to apply over the base image.
|
||||
///
|
||||
pub struct PageReconstructData {
|
||||
pub records: Vec<(Lsn, WALRecord)>,
|
||||
pub records: Vec<WALRecord>,
|
||||
pub page_img: Option<Bytes>,
|
||||
}
|
||||
|
||||
@@ -73,15 +87,13 @@ pub struct PageReconstructData {
|
||||
pub enum PageReconstructResult {
|
||||
/// Got all the data needed to reconstruct the requested page
|
||||
Complete,
|
||||
/// This layer didn't contain all the required data, the caller should look up
|
||||
/// the predecessor layer at the returned LSN and collect more data from there.
|
||||
Continue(Lsn),
|
||||
/// This layer didn't contain all the required data, the caller should collect
|
||||
/// more data from the returned predecessor layer at the returned LSN.
|
||||
Continue(Lsn, Arc<dyn Layer>),
|
||||
/// This layer didn't contain data needed to reconstruct the page version at
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
/// Use the cached image at `cached_img_lsn` as the base image
|
||||
Cached,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -93,8 +105,6 @@ pub enum PageReconstructResult {
|
||||
/// in-memory and on-disk layers.
|
||||
///
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> ZTenantId;
|
||||
|
||||
/// Identify the timeline this relish belongs to
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
|
||||
@@ -114,6 +124,10 @@ pub trait Layer: Send + Sync {
|
||||
/// Is the segment represented by this layer dropped by PostgreSQL?
|
||||
fn is_dropped(&self) -> bool;
|
||||
|
||||
/// Gets the physical location of the layer on disk.
|
||||
/// Some layers, such as in-memory, might not have the location.
|
||||
fn path(&self) -> Option<PathBuf>;
|
||||
|
||||
/// Filename used to store this layer on disk. (Even in-memory layers
|
||||
/// implement this, to print a handy unique identifier for the layer for
|
||||
/// log messages, even though they're never not on disk.)
|
||||
@@ -129,19 +143,15 @@ pub trait Layer: Send + Sync {
|
||||
/// of the *relish*, not the beginning of the segment. The requested
|
||||
/// 'blknum' must be covered by this segment.
|
||||
///
|
||||
/// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
|
||||
/// This function will only return data after `cached_img_lsn`.
|
||||
///
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call. If this returns PageReconstructResult::Continue, look up
|
||||
/// the predecessor layer and call again with the same 'reconstruct_data'
|
||||
/// on first call. If this returns PageReconstructResult::Continue, call
|
||||
/// again on the returned predecessor layer with the same 'reconstruct_data'
|
||||
/// to collect more data.
|
||||
fn get_page_reconstruct_data(
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult>;
|
||||
|
||||
@@ -157,9 +167,6 @@ pub trait Layer: Send + Sync {
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
|
||||
/// Release memory used by this layer. There is no corresponding 'load'
|
||||
/// function, that's done implicitly when you call one of the get-functions.
|
||||
fn unload(&self) -> Result<()>;
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -13,15 +11,12 @@ pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod http;
|
||||
pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod remote_storage;
|
||||
mod relish_storage;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
pub mod virtual_file;
|
||||
pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
@@ -39,18 +34,12 @@ pub mod defaults {
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
|
||||
pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -79,10 +68,6 @@ pub struct PageServerConf {
|
||||
pub gc_period: Duration,
|
||||
pub superuser: String,
|
||||
|
||||
pub open_mem_limit: usize,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
@@ -96,7 +81,7 @@ pub struct PageServerConf {
|
||||
pub auth_type: AuthType,
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
pub relish_storage_config: Option<RelishStorageConfig>,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
@@ -105,7 +90,7 @@ impl PageServerConf {
|
||||
//
|
||||
|
||||
fn tenants_path(&self) -> PathBuf {
|
||||
self.workdir.join(TENANTS_SEGMENT_NAME)
|
||||
self.workdir.join("tenants")
|
||||
}
|
||||
|
||||
fn tenant_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
@@ -129,13 +114,21 @@ impl PageServerConf {
|
||||
}
|
||||
|
||||
fn timelines_path(&self, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.tenant_path(tenantid).join(TIMELINES_SEGMENT_NAME)
|
||||
self.tenant_path(tenantid).join("timelines")
|
||||
}
|
||||
|
||||
fn timeline_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timelines_path(tenantid).join(timelineid.to_string())
|
||||
}
|
||||
|
||||
fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("ancestor")
|
||||
}
|
||||
|
||||
fn wal_dir_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("wal")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -161,9 +154,6 @@ impl PageServerConf {
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
|
||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
@@ -171,55 +161,25 @@ impl PageServerConf {
|
||||
pg_distrib_dir: "".into(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
relish_storage_config: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Config for the Repository checkpointer
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum CheckpointConfig {
|
||||
// Flush in-memory data that is older than this
|
||||
Distance(u64),
|
||||
// Flush all in-memory data
|
||||
Forced,
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
/// External relish storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored relish data into.
|
||||
pub enum RelishStorageConfig {
|
||||
/// Root folder to place all stored relish data into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all relishes into the root
|
||||
/// of the S3 bucket from the config.
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// "Login" to use when connecting to bucket.
|
||||
/// Can be empty for cases like AWS k8s IAM
|
||||
/// where we can allow certain pods to connect
|
||||
/// to the bucket directly without any credentials.
|
||||
pub access_key_id: Option<String>,
|
||||
/// "Password" to use when connecting to bucket.
|
||||
pub secret_access_key: Option<String>,
|
||||
}
|
||||
|
||||
|
||||
@@ -1,778 +0,0 @@
|
||||
//!
|
||||
//! Global page cache
|
||||
//!
|
||||
//! The page cache uses up most of the memory in the page server. It is shared
|
||||
//! by all tenants, and it is used to store different kinds of pages. Sharing
|
||||
//! the cache allows memory to be dynamically allocated where it's needed the
|
||||
//! most.
|
||||
//!
|
||||
//! The page cache consists of fixed-size buffers, 8 kB each to match the
|
||||
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
|
||||
//! information about what's stored in the buffer.
|
||||
//!
|
||||
//! # Locking
|
||||
//!
|
||||
//! There are two levels of locking involved: There's one lock for the "mapping"
|
||||
//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
|
||||
//! slot, and a separate lock on each slot. To read or write the contents of a
|
||||
//! slot, you must hold the lock on the slot in read or write mode,
|
||||
//! respectively. To change the mapping of a slot, i.e. to evict a page or to
|
||||
//! assign a buffer for a page, you must hold the mapping lock and the lock on
|
||||
//! the slot at the same time.
|
||||
//!
|
||||
//! Whenever you need to hold both locks simultenously, the slot lock must be
|
||||
//! acquired first. This consistent ordering avoids deadlocks. To look up a page
|
||||
//! in the cache, you would first look up the mapping, while holding the mapping
|
||||
//! lock, and then lock the slot. You must release the mapping lock in between,
|
||||
//! to obey the lock ordering and avoid deadlock.
|
||||
//!
|
||||
//! A slot can momentarily have invalid contents, even if it's already been
|
||||
//! inserted to the mapping, but you must hold the write-lock on the slot until
|
||||
//! the contents are valid. If you need to release the lock without initializing
|
||||
//! the contents, you must remove the mapping first. We make that easy for the
|
||||
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
|
||||
//! page, the caller must explicitly call guard.mark_valid() after it has
|
||||
//! initialized it. If the guard is dropped without calling mark_valid(), the
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
|
||||
use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
convert::TryInto,
|
||||
sync::{
|
||||
atomic::{AtomicU8, AtomicUsize, Ordering},
|
||||
RwLock, RwLockReadGuard, RwLockWriteGuard,
|
||||
},
|
||||
};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::writeback_ephemeral_file;
|
||||
use crate::{relish::RelTag, PageServerConf};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 10;
|
||||
|
||||
///
|
||||
/// Initialize the page cache. This must be called once at page server startup.
|
||||
///
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
if PAGE_CACHE
|
||||
.set(PageCache::new(conf.page_cache_size))
|
||||
.is_err()
|
||||
{
|
||||
panic!("page cache already initialized");
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the page cache.
|
||||
///
|
||||
pub fn get() -> &'static PageCache {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// page_cache::init(). Initialize it here with a tiny cache, so that the
|
||||
// page cache is usable in unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
|
||||
} else {
|
||||
PAGE_CACHE.get().expect("page cache not initialized")
|
||||
}
|
||||
}
|
||||
|
||||
pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
///
|
||||
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
enum CacheKey {
|
||||
MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey,
|
||||
lsn: Lsn,
|
||||
},
|
||||
EphemeralPage {
|
||||
file_id: u64,
|
||||
blkno: u32,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
struct MaterializedPageHashKey {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Version {
|
||||
lsn: Lsn,
|
||||
slot_idx: usize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: RwLock<SlotInner>,
|
||||
usage_count: AtomicU8,
|
||||
}
|
||||
|
||||
struct SlotInner {
|
||||
key: Option<CacheKey>,
|
||||
buf: &'static mut [u8; PAGE_SZ],
|
||||
dirty: bool,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
/// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
|
||||
fn inc_usage_count(&self) {
|
||||
let _ = self
|
||||
.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
|
||||
if val == MAX_USAGE_COUNT {
|
||||
None
|
||||
} else {
|
||||
Some(val + 1)
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Decrement usage count on the buffer, unless it's already zero. Returns
|
||||
/// the old usage count.
|
||||
fn dec_usage_count(&self) -> u8 {
|
||||
let count_res =
|
||||
self.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
|
||||
if val == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(val - 1)
|
||||
}
|
||||
});
|
||||
|
||||
match count_res {
|
||||
Ok(usage_count) => usage_count,
|
||||
Err(usage_count) => usage_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageCache {
|
||||
/// This contains the mapping from the cache key to buffer slot that currently
|
||||
/// contains the page, if any.
|
||||
///
|
||||
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
|
||||
/// this HashMap can be replaced with a more concurrent version, there are
|
||||
/// plenty of such crates around.
|
||||
///
|
||||
/// If you add support for caching different kinds of objects, each object kind
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
slots: Box<[Slot]>,
|
||||
|
||||
/// Index of the next candidate to evict, for the Clock replacement algorithm.
|
||||
/// This is interpreted modulo the page cache size.
|
||||
next_evict_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
///
|
||||
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
|
||||
|
||||
impl std::ops::Deref for PageReadGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.buf
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
/// Counterintuitively, this is used even for a read, if the requested page is not
|
||||
/// currently found in the page cache. In that case, the caller of lock_for_read()
|
||||
/// is expected to fill in the page contents and call mark_valid(). Similarly
|
||||
/// lock_for_write() can return an invalid buffer that the caller is expected to
|
||||
/// to initialize.
|
||||
///
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: RwLockWriteGuard<'i, SlotInner>,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
valid: bool,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl PageWriteGuard<'_> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
pub fn mark_valid(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
assert!(
|
||||
!self.valid,
|
||||
"mark_valid called on a buffer that was already valid"
|
||||
);
|
||||
self.valid = true;
|
||||
}
|
||||
pub fn mark_dirty(&mut self) {
|
||||
// only ephemeral pages can be dirty ATM.
|
||||
assert!(matches!(
|
||||
self.inner.key,
|
||||
Some(CacheKey::EphemeralPage { .. })
|
||||
));
|
||||
self.inner.dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageWriteGuard<'_> {
|
||||
///
|
||||
/// If the buffer was allocated for a page that was not already in the
|
||||
/// cache, but the lock_for_read/write() caller dropped the buffer without
|
||||
/// initializing it, remove the mapping from the page cache.
|
||||
///
|
||||
fn drop(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
if !self.valid {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
self.inner.dirty = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// lock_for_read() return value
|
||||
pub enum ReadBufResult<'a> {
|
||||
Found(PageReadGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
/// lock_for_write() return value
|
||||
pub enum WriteBufResult<'a> {
|
||||
Found(PageWriteGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// Section 1.1: Public interface functions for looking up and memorizing materialized page
|
||||
// versions in the page cache
|
||||
//
|
||||
|
||||
/// Look up a materialized page version.
|
||||
///
|
||||
/// The 'lsn' is an upper bound, this will return the latest version of
|
||||
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
|
||||
/// returned page.
|
||||
pub fn lookup_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
let mut cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
|
||||
if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
|
||||
Some((lsn, guard))
|
||||
} else {
|
||||
panic!("unexpected key type in slot");
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
pub fn memorize_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
) {
|
||||
let cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key) {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(*write_guard == img);
|
||||
}
|
||||
WriteBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(img);
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with Ephemeral pages.
|
||||
|
||||
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
|
||||
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
|
||||
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_write(&cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file, without writeback
|
||||
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 2: Internal interface functions for lookup/update.
|
||||
//
|
||||
// To add support for a new kind of "thing" to cache, you will need
|
||||
// to add public interface routines above, and code to deal with the
|
||||
// "mappings" after this section. But the routines in this section should
|
||||
// not require changes.
|
||||
|
||||
/// Look up a page in the cache.
|
||||
///
|
||||
/// If the search criteria is not exact, *cache_key is updated with the key
|
||||
/// for exact key of the returned page. (For materialized pages, that means
|
||||
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
|
||||
/// version.)
|
||||
///
|
||||
/// If no page is found, returns None and *cache_key is left unmodified.
|
||||
///
|
||||
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
|
||||
let cache_key_orig = cache_key.clone();
|
||||
if let Some(slot_idx) = self.search_mapping(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.read().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageReadGuard(inner));
|
||||
} else {
|
||||
// search_mapping might have modified the search key; restore it.
|
||||
*cache_key = cache_key_orig;
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a locked buffer for given block.
|
||||
///
|
||||
/// Like try_lock_for_read(), if the search criteria is not exact and the
|
||||
/// page is already found in the cache, *cache_key is updated.
|
||||
///
|
||||
/// If the page is not found in the cache, this allocates a new buffer for
|
||||
/// it. The caller may then initialize the buffer with the contents, and
|
||||
/// call mark_valid().
|
||||
///
|
||||
/// Example usage:
|
||||
///
|
||||
/// ```ignore
|
||||
/// let cache = page_cache::get();
|
||||
///
|
||||
/// match cache.lock_for_read(&key) {
|
||||
/// ReadBufResult::Found(read_guard) => {
|
||||
/// // The page was found in cache. Use it
|
||||
/// },
|
||||
/// ReadBufResult::NotFound(write_guard) => {
|
||||
/// // The page was not found in cache. Read it from disk into the
|
||||
/// // buffer.
|
||||
/// //read_my_page_from_disk(write_guard);
|
||||
///
|
||||
/// // The buffer contents are now valid. Tell the page cache.
|
||||
/// write_guard.mark_valid();
|
||||
/// },
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
|
||||
return ReadBufResult::Found(read_guard);
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return ReadBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a page in the cache and lock it in write mode. If it's not
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageWriteGuard { inner, valid: true });
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a write-locked buffer for given block.
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
|
||||
return WriteBufResult::Found(write_guard);
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return WriteBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 3: Mapping functions
|
||||
//
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Returns the slot index, if any. If the search criteria is not exact,
|
||||
/// *cache_key is updated with the actual key of the found page.
|
||||
///
|
||||
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
|
||||
/// get recycled for an unrelated page immediately after this function
|
||||
/// returns. The caller is responsible for re-checking that the slot still
|
||||
/// contains the page with the same key before using it.
|
||||
///
|
||||
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => version_idx,
|
||||
Err(0) => return None,
|
||||
Err(version_idx) => version_idx - 1,
|
||||
};
|
||||
let version = &versions[version_idx];
|
||||
*lsn = version.lsn;
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Like 'search_mapping, but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Some(versions[version_idx].slot_idx)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Remove mapping for given key.
|
||||
///
|
||||
fn remove_mapping(&self, old_key: &CacheKey) {
|
||||
match old_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: old_hash_key,
|
||||
lsn: old_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
|
||||
let versions = old_entry.get_mut();
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
|
||||
versions.remove(version_idx);
|
||||
if versions.is_empty() {
|
||||
old_entry.remove_entry();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
panic!("could not find old key in mapping")
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert mapping for given key.
|
||||
///
|
||||
/// If a mapping already existed for the given key, returns the slot index
|
||||
/// of the existing mapping and leaves it untouched.
|
||||
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
|
||||
match new_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: new_key,
|
||||
lsn: new_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
let versions = map.entry(new_key.clone()).or_default();
|
||||
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => Some(versions[version_idx].slot_idx),
|
||||
Err(version_idx) => {
|
||||
versions.insert(
|
||||
version_idx,
|
||||
Version {
|
||||
lsn: *new_lsn,
|
||||
slot_idx,
|
||||
},
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 4: Misc internal helpers
|
||||
//
|
||||
|
||||
/// Find a slot to evict.
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
|
||||
let iter_limit = self.slots.len() * 2;
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
|
||||
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
if slot.dec_usage_count() == 0 || iters >= iter_limit {
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
|
||||
if let Some(old_key) = &inner.key {
|
||||
if inner.dirty {
|
||||
if let Err(err) = Self::writeback(old_key, inner.buf) {
|
||||
// Writing the page to disk failed.
|
||||
//
|
||||
// FIXME: What to do here, when? We could propagate the error to the
|
||||
// caller, but victim buffer is generally unrelated to the original
|
||||
// call. It can even belong to a different tenant. Currently, we
|
||||
// report the error to the log and continue the clock sweep to find
|
||||
// a different victim. But if the problem persists, the page cache
|
||||
// could fill up with dirty pages that we cannot evict, and we will
|
||||
// loop retrying the writebacks indefinitely.
|
||||
error!("writeback of buffer {:?} failed: {}", old_key, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(old_key);
|
||||
inner.dirty = false;
|
||||
inner.key = None;
|
||||
}
|
||||
return (slot_idx, inner);
|
||||
}
|
||||
|
||||
iters += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: _,
|
||||
} => {
|
||||
panic!("unexpected dirty materialized page");
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
writeback_ephemeral_file(*file_id, *blkno, buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a new page cache
|
||||
///
|
||||
/// This should be called only once at page server startup.
|
||||
fn new(num_pages: usize) -> Self {
|
||||
assert!(num_pages > 0, "page cache size must be > 0");
|
||||
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
|
||||
let slots = page_buffer
|
||||
.chunks_exact_mut(PAGE_SZ)
|
||||
.map(|chunk| {
|
||||
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
|
||||
|
||||
Slot {
|
||||
inner: RwLock::new(SlotInner {
|
||||
key: None,
|
||||
buf,
|
||||
dirty: false,
|
||||
}),
|
||||
usage_count: AtomicU8::new(0),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
ephemeral_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,9 +10,10 @@
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use regex::Regex;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
@@ -20,12 +21,10 @@ use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
use std::{io, net::TcpStream};
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::auth::{self, JwtAuth};
|
||||
use zenith_utils::auth::{Claims, Scope};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::is_socket_read_timed_out;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::postgres_backend::{self, AuthType};
|
||||
use zenith_utils::pq_proto::{
|
||||
@@ -188,32 +187,17 @@ pub fn thread_main(
|
||||
listener: TcpListener,
|
||||
auth_type: AuthType,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
loop {
|
||||
let (socket, peer_addr) = listener.accept()?;
|
||||
debug!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
let local_auth = auth.clone();
|
||||
|
||||
let handle = thread::Builder::new()
|
||||
.name("serving Page Service thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!(%err, "page server thread exited with error");
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
join_handles.push(handle);
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
|
||||
error!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
debug!("page_service loop terminated. wait for connections to cancel");
|
||||
for handle in join_handles.into_iter() {
|
||||
handle.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn page_service_conn_main(
|
||||
@@ -232,7 +216,7 @@ fn page_service_conn_main(
|
||||
}
|
||||
|
||||
let mut conn_handler = PageServerHandler::new(conf, auth);
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
|
||||
let pgbackend = PostgresBackend::new(socket, auth_type, None)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
@@ -276,66 +260,50 @@ impl PageServerHandler {
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
match pgb.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(message) = message {
|
||||
trace!("query: {:?}", message);
|
||||
while let Some(message) = pgb.read_message()? {
|
||||
trace!("query({:?}): {:?}", timelineid, message);
|
||||
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
let zenith_fe_msg = PagestreamFeMessage::parse(copy_data_bytes)?;
|
||||
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_nblocks_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
}),
|
||||
};
|
||||
let response = match zenith_fe_msg {
|
||||
PagestreamFeMessage::Exists(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_exists"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_rel_exists_request(&*timeline, &req)
|
||||
}),
|
||||
PagestreamFeMessage::Nblocks(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_rel_size"])
|
||||
.observe_closure_duration(|| self.handle_get_nblocks_request(&*timeline, &req)),
|
||||
PagestreamFeMessage::GetPage(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_page_at_lsn"])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(&*timeline, &req)
|
||||
}),
|
||||
};
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
error!("error reading relation or page version: {:#}", e);
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
// print the all details to the log with {:#}, but for the client the
|
||||
// error message is enough
|
||||
error!("error reading relation or page version: {:#}", e);
|
||||
PagestreamBeMessage::Error(PagestreamErrorResponse {
|
||||
message: e.to_string(),
|
||||
})
|
||||
});
|
||||
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
if !is_socket_read_timed_out(&e) {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
pgb.write_message(&BeMessage::CopyData(&response.serialize()))?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -395,8 +363,6 @@ impl PageServerHandler {
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamExistsRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_rel_exists", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
@@ -412,7 +378,6 @@ impl PageServerHandler {
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamNblocksRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_nblocks", rel = %req.rel, req_lsn = %req.lsn).entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
@@ -432,8 +397,6 @@ impl PageServerHandler {
|
||||
timeline: &dyn Timeline,
|
||||
req: &PagestreamGetPageRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_page", rel = %req.rel, blkno = &req.blkno, req_lsn = %req.lsn)
|
||||
.entered();
|
||||
let tag = RelishTag::Relation(req.rel);
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest)?;
|
||||
|
||||
@@ -451,25 +414,17 @@ impl PageServerHandler {
|
||||
lsn: Option<Lsn>,
|
||||
tenantid: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
let span = info_span!("basebackup", timeline = %timelineid, tenant = %tenantid, lsn = field::Empty);
|
||||
let _enter = span.enter();
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
if let Some(lsn) = lsn {
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
}
|
||||
|
||||
// switch client to COPYOUT
|
||||
/* switch client to COPYOUT */
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
info!("sent CopyOut");
|
||||
|
||||
/* Send a tarball of the latest layer on the timeline */
|
||||
{
|
||||
let mut writer = CopyDataSink { pgb };
|
||||
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()?;
|
||||
}
|
||||
pgb.write_message(&BeMessage::CopyDone)?;
|
||||
@@ -574,6 +529,11 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
None
|
||||
};
|
||||
|
||||
info!(
|
||||
"got basebackup command. tenantid=\"{}\" timelineid=\"{}\" lsn=\"{:#?}\"",
|
||||
tenantid, timelineid, lsn
|
||||
);
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid)?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
@@ -591,9 +551,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let _enter =
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
|
||||
@@ -616,9 +573,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
let _enter =
|
||||
info_span!("branch_create", name = %branchname, tenant = %tenantid).entered();
|
||||
|
||||
let branch =
|
||||
branches::create_branch(self.conf, &branchname, &startpoint_str, &tenantid)?;
|
||||
let branch = serde_json::to_vec(&branch)?;
|
||||
@@ -635,16 +589,14 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
|
||||
// since these handlers for tenant/branch commands are deprecated (in favor of http based ones)
|
||||
// just use false in place of include non incremental logical size
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid, false)?;
|
||||
let branches = crate::branches::get_branches(self.conf, &tenantid)?;
|
||||
let branches_buf = serde_json::to_vec(&branches)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(&branches_buf)]))?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("tenant_list") {
|
||||
let tenants = crate::tenant_mgr::list_tenants()?;
|
||||
let tenants = crate::branches::get_tenants(self.conf)?;
|
||||
let tenants_buf = serde_json::to_vec(&tenants)?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
@@ -696,7 +648,9 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
|
||||
|
||||
54
pageserver/src/relish_storage.rs
Normal file
54
pageserver/src/relish_storage.rs
Normal file
@@ -0,0 +1,54 @@
|
||||
//! Abstractions for the page server to store its relish layer data in the external storage.
|
||||
//!
|
||||
//! Main purpose of this module subtree is to provide a set of abstractions to manage the storage state
|
||||
//! in a way, optimal for page server.
|
||||
//!
|
||||
//! The abstractions hide multiple custom external storage API implementations,
|
||||
//! such as AWS S3, local filesystem, etc., located in the submodules.
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
/// A queue and the background machinery behind it to upload
|
||||
/// local page server layer files to external storage.
|
||||
pub mod storage_uploader;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RelishStorage: Send + Sync {
|
||||
type RelishStoragePath;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath>;
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
|
||||
|
||||
async fn download_relish(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
to: &Path,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
|
||||
|
||||
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
fn strip_workspace_prefix<'a>(
|
||||
page_server_workdir: &'a Path,
|
||||
relish_local_path: &'a Path,
|
||||
) -> anyhow::Result<&'a Path> {
|
||||
relish_local_path
|
||||
.strip_prefix(page_server_workdir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Unexpected: relish local path '{}' is not relevant to server workdir",
|
||||
relish_local_path.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
158
pageserver/src/relish_storage/local_fs.rs
Normal file
158
pageserver/src/relish_storage/local_fs.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
//! Local filesystem relish storage.
|
||||
//!
|
||||
//! Page server already stores layer data on the server, when freezing it.
|
||||
//! This storage serves a way to
|
||||
//!
|
||||
//! * test things locally simply
|
||||
//! * allow to compabre both binary sets
|
||||
//! * help validating the relish storage API
|
||||
|
||||
use std::{
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
|
||||
use super::{strip_workspace_prefix, RelishStorage};
|
||||
|
||||
pub struct LocalFs {
|
||||
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Atetmpts to create local FS relish storage, also creates the directory provided, if not exists.
|
||||
pub fn new(root: PathBuf) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path {}",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self { root })
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for LocalFs {
|
||||
type RelishStoragePath = PathBuf;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
Ok(strip_workspace_prefix(page_server_workdir, relish_local_path)?.to_path_buf())
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
Ok(get_all_files(&self.root).await?.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn download_relish(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
to: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
create_target_directory(to).await?;
|
||||
tokio::fs::copy(file_path, to).await?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(tokio::fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
|
||||
tokio::fs::copy(&from, &target_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload relish '{}' to local storage",
|
||||
from.display(),
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = tokio::fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
log::debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(dir_entry.path());
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
None => bail!(
|
||||
"Relish path '{}' has no parent directory",
|
||||
target_file_path.display()
|
||||
),
|
||||
};
|
||||
if !target_dir.exists() {
|
||||
tokio::fs::create_dir_all(target_dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
144
pageserver/src/relish_storage/rust_s3.rs
Normal file
144
pageserver/src/relish_storage/rust_s3.rs
Normal file
@@ -0,0 +1,144 @@
|
||||
//! A wrapper around AWS S3 client library `rust_s3` to be used a relish storage.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
|
||||
use crate::{relish_storage::strip_workspace_prefix, S3Config};
|
||||
|
||||
use super::RelishStorage;
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 relish storage.
|
||||
pub struct RustS3 {
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl RustS3 {
|
||||
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?;
|
||||
let credentials = Credentials::new(
|
||||
aws_config.access_key_id.as_deref(),
|
||||
aws_config.secret_access_key.as_deref(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.context("Failed to create the s3 credentials")?;
|
||||
Ok(Self {
|
||||
bucket: Bucket::new_with_path_style(
|
||||
aws_config.bucket_name.as_str(),
|
||||
region,
|
||||
credentials,
|
||||
)
|
||||
.context("Failed to create the s3 bucket")?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RelishStorage for RustS3 {
|
||||
type RelishStoragePath = S3ObjectKey;
|
||||
|
||||
fn derive_destination(
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
let relative_path = strip_workspace_prefix(page_server_workdir, relish_local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
.list(String::new(), None)
|
||||
.await
|
||||
.context("Failed to list s3 objects")?;
|
||||
|
||||
Ok(list_response
|
||||
.into_iter()
|
||||
.flat_map(|response| response.contents)
|
||||
.map(|s3_object| S3ObjectKey(s3_object.key))
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn download_relish(
|
||||
&self,
|
||||
from: &Self::RelishStoragePath,
|
||||
to: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut target_file = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.open(to)
|
||||
.with_context(|| format!("Failed to open target s3 destination at {}", to.display()))?;
|
||||
let code = self
|
||||
.bucket
|
||||
.get_object_stream(from.key(), &mut target_file)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during downloading object from directory, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.delete_object(path.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during deleting object with key '{}', code: {}",
|
||||
path.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish(&self, from: &Path, to: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let mut local_file = tokio::fs::OpenOptions::new().read(true).open(from).await?;
|
||||
|
||||
let code = self
|
||||
.bucket
|
||||
.put_object_stream(&mut local_file, to.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
116
pageserver/src/relish_storage/storage_uploader.rs
Normal file
116
pageserver/src/relish_storage/storage_uploader.rs
Normal file
@@ -0,0 +1,116 @@
|
||||
use std::{
|
||||
collections::VecDeque,
|
||||
path::{Path, PathBuf},
|
||||
sync::{Arc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use crate::{relish_storage::RelishStorage, RelishStorageConfig};
|
||||
|
||||
use super::{local_fs::LocalFs, rust_s3::RustS3};
|
||||
|
||||
pub struct QueueBasedRelishUploader {
|
||||
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
|
||||
}
|
||||
|
||||
impl QueueBasedRelishUploader {
|
||||
pub fn new(
|
||||
config: &RelishStorageConfig,
|
||||
page_server_workdir: &'static Path,
|
||||
) -> anyhow::Result<Self> {
|
||||
let upload_queue = Arc::new(Mutex::new(VecDeque::new()));
|
||||
let _handle = match config {
|
||||
RelishStorageConfig::LocalFs(root) => {
|
||||
let relish_storage = LocalFs::new(root.clone())?;
|
||||
create_upload_thread(
|
||||
Arc::clone(&upload_queue),
|
||||
relish_storage,
|
||||
page_server_workdir,
|
||||
)?
|
||||
}
|
||||
RelishStorageConfig::AwsS3(s3_config) => {
|
||||
let relish_storage = RustS3::new(s3_config)?;
|
||||
create_upload_thread(
|
||||
Arc::clone(&upload_queue),
|
||||
relish_storage,
|
||||
page_server_workdir,
|
||||
)?
|
||||
}
|
||||
};
|
||||
|
||||
Ok(Self { upload_queue })
|
||||
}
|
||||
|
||||
pub fn schedule_upload(&self, timeline_id: ZTimelineId, relish_path: PathBuf) {
|
||||
self.upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.push_back((timeline_id, relish_path))
|
||||
}
|
||||
}
|
||||
|
||||
fn create_upload_thread<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
|
||||
upload_queue: Arc<Mutex<VecDeque<(ZTimelineId, PathBuf)>>>,
|
||||
relish_storage: S,
|
||||
page_server_workdir: &'static Path,
|
||||
) -> std::io::Result<thread::JoinHandle<()>> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
thread::Builder::new()
|
||||
.name("Queue based relish uploader".to_string())
|
||||
.spawn(move || loop {
|
||||
runtime.block_on(async {
|
||||
upload_loop_step(&upload_queue, &relish_storage, page_server_workdir).await;
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
async fn upload_loop_step<P, S: 'static + RelishStorage<RelishStoragePath = P>>(
|
||||
upload_queue: &Mutex<VecDeque<(ZTimelineId, PathBuf)>>,
|
||||
relish_storage: &S,
|
||||
page_server_workdir: &Path,
|
||||
) {
|
||||
let mut queue_accessor = upload_queue.lock().unwrap();
|
||||
log::debug!("current upload queue length: {}", queue_accessor.len());
|
||||
let next_upload = queue_accessor.pop_front();
|
||||
drop(queue_accessor);
|
||||
|
||||
let (relish_timeline_id, relish_local_path) = match next_upload {
|
||||
Some(data) => data,
|
||||
None => {
|
||||
// Don't spin and allow others to use the queue.
|
||||
// In future, could be improved to be more clever about delays depending on relish upload stats
|
||||
thread::sleep(std::time::Duration::from_secs(1));
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = upload_relish(relish_storage, page_server_workdir, &relish_local_path).await {
|
||||
log::error!(
|
||||
"Failed to upload relish '{}' for timeline {}, reason: {}",
|
||||
relish_local_path.display(),
|
||||
relish_timeline_id,
|
||||
e
|
||||
);
|
||||
upload_queue
|
||||
.lock()
|
||||
.unwrap()
|
||||
.push_back((relish_timeline_id, relish_local_path))
|
||||
} else {
|
||||
log::debug!("Relish successfully uploaded");
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<P, S: RelishStorage<RelishStoragePath = P>>(
|
||||
relish_storage: &S,
|
||||
page_server_workdir: &Path,
|
||||
relish_local_path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let destination = S::derive_destination(page_server_workdir, relish_local_path)?;
|
||||
relish_storage
|
||||
.upload_relish(relish_local_path, &destination)
|
||||
.await
|
||||
}
|
||||
@@ -1,182 +0,0 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! This particular module serves as a public API border between pageserver and the internal storage machinery.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! There are a few components the storage machinery consists of:
|
||||
//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
|
||||
//!
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//!
|
||||
//! * public API via to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_checkpoint_upload`]
|
||||
//!
|
||||
//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
|
||||
//!
|
||||
//! +------------------------+ +--------->-------+
|
||||
//! | | - - - (init async loop) - - - -> | |
|
||||
//! | | | |
|
||||
//! | | -------------------------------> | async |
|
||||
//! | pageserver | (schedule checkpoint upload) | upload/download |
|
||||
//! | | | loop |
|
||||
//! | | <------------------------------- | |
|
||||
//! | | (register downloaded layers) | |
|
||||
//! +------------------------+ +---------<-------+
|
||||
//! |
|
||||
//! |
|
||||
//! CRUD layer file operations |
|
||||
//! (upload/download/delete/list, etc.) |
|
||||
//! V
|
||||
//! +------------------------+
|
||||
//! | |
|
||||
//! | [`RemoteStorage`] impl |
|
||||
//! | |
|
||||
//! | pageserver assumes it |
|
||||
//! | owns exclusive write |
|
||||
//! | access to this storage |
|
||||
//! +------------------------+
|
||||
//!
|
||||
//! First, during startup, the pageserver inits the storage sync thread with the async loop, or leaves the loop unitialised, if configured so.
|
||||
//! Some time later, during pageserver checkpoints, in-memory data is flushed onto disk along with its metadata.
|
||||
//! If the storage sync loop was successfully started before, pageserver schedules the new image uploads after every checkpoint.
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
|
||||
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! When a certain image gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same image state.
|
||||
//! No files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer timeline is downloaded.
|
||||
//!
|
||||
//! Meanwhile, the loop inits the storage connection and checks the remote files stored.
|
||||
//! This is done once at startup only, relying on the fact that pageserver uses the storage alone (ergo, nobody else uploads the files to the storage but this server).
|
||||
//! Based on the remote image data, the storage sync logic queues image downloads, while accepting any potential upload tasks from pageserver and managing the tasks by their priority.
|
||||
//! On the image download, a [`crate::tenant_mgr::register_relish_download`] function is called to register the new image in pageserver, initializing all related threads and internal state.
|
||||
//!
|
||||
//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
|
||||
//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API.
|
||||
//!
|
||||
//! * the uploads do not happen right after pageserver startup, they are registered when
|
||||
//! 1. pageserver does the checkpoint, which happens further in the future after the server start
|
||||
//! 2. pageserver loads the timeline from disk for the first time
|
||||
//!
|
||||
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with bigger priority could be waiting already
|
||||
//!
|
||||
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happens on an image scale: a big set of remote files,
|
||||
//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per reilsh file from those images.
|
||||
//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
|
||||
|
||||
mod local_fs;
|
||||
mod rust_s3;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{
|
||||
path::{Path, PathBuf},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::io;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub use self::storage_sync::schedule_timeline_checkpoint_upload;
|
||||
use self::{local_fs::LocalFs, rust_s3::S3};
|
||||
use crate::{PageServerConf, RemoteStorageKind};
|
||||
|
||||
/// Any timeline has its own id and its own tenant it belongs to,
|
||||
/// the sync processes group timelines by both for simplicity.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TimelineSyncId(ZTenantId, ZTimelineId);
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
/// If no external configuraion connection given, no thread or storage initialization is done.
|
||||
pub fn run_storage_sync_thread(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
match &config.remote_storage_config {
|
||||
Some(storage_config) => {
|
||||
let max_concurrent_sync = storage_config.max_concurrent_sync;
|
||||
let max_sync_errors = storage_config.max_sync_errors;
|
||||
let handle = match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
),
|
||||
RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
S3::new(s3_config, &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
),
|
||||
};
|
||||
handle.map(Some)
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations with storage files.
|
||||
#[async_trait::async_trait]
|
||||
trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type StoragePath;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,75 +0,0 @@
|
||||
# Non-implementation details
|
||||
|
||||
This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are the future development plans.
|
||||
Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
|
||||
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
|
||||
|
||||
## Approach
|
||||
|
||||
Backup functionality is a new component, appeared way after the core DB functionality was implemented.
|
||||
Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
|
||||
|
||||
To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
|
||||
This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver.
|
||||
|
||||
## What's done
|
||||
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* uploads layers, frozen by pageserver checkpoint thread
|
||||
* downloads and registers layers, found on the remote storage, but missing locally
|
||||
|
||||
No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
|
||||
It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments.
|
||||
|
||||
### Peculiarities
|
||||
|
||||
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
|
||||
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
|
||||
AWS charges per API call and for traffic either, layers are expected to be updated frequently, so this model most probably is ineffective.
|
||||
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
|
||||
|
||||
Storage sync API operates images when backing up or restoring a backup, so we're fluent to repack the layer contents the way we want to, which most probably will be done later.
|
||||
|
||||
* no proper file comparison
|
||||
|
||||
Currently, every layer contains `Lsn` in their name, to map the data it holds against a certain DB state.
|
||||
Then the images with same ids and different `Lsn`'s are compared, files are considered equal if their local file paths are equal (for remote files, "local file path" is their download destination).
|
||||
No file contents assertion is done currently, but should be.
|
||||
AWS S3 returns file checksums during the `list` operation, so that can be used to ensure the backup consistency, but that needs further research and, since current pageserver impl also needs to deal with layer file checksums.
|
||||
|
||||
For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
|
||||
|
||||
* sad rust-s3 api
|
||||
|
||||
rust-s3 is not very pleasant to use:
|
||||
1. it returns `anyhow::Result` and it's hard to distinguish "missing file" cases from "no connection" one, for instance
|
||||
2. at least one function it its API that we need (`get_object_stream`) has `async` keyword and blocks (!), see details [here](https://github.com/zenithdb/zenith/pull/752#discussion_r728373091)
|
||||
3. it's a prerelease library with unclear maintenance status
|
||||
4. noisy on debug level
|
||||
|
||||
But it's already used in the project, so for now it's reused to avoid bloating the dependency tree.
|
||||
Based on previous evaluation, even `rusoto-s3` could be a better choice over this library, but needs further benchmarking.
|
||||
|
||||
|
||||
* gc and branches are ignored
|
||||
|
||||
So far, we don't consider non-main images and don't adjust the remote storage based on GC thread loop results.
|
||||
Only checkpointer loop affects the remote storage.
|
||||
|
||||
* more layers should be downloaded on demand
|
||||
|
||||
Since we download and load remote layers into pageserver, there's a possibility a need for those layers' ancestors arise.
|
||||
Most probably, every downloaded image's ancestor is not present in locally too, but currently there's no logic for downloading such ancestors and their metadata,
|
||||
so the pageserver is unable to respond property on requests to such ancestors.
|
||||
|
||||
To implement the downloading, more `tenant_mgr` refactoring is needed to properly handle web requests for layers and handle the state changes.
|
||||
[Here](https://github.com/zenithdb/zenith/pull/689#issuecomment-931216193) are the details about initial state management updates needed.
|
||||
|
||||
* no IT tests
|
||||
|
||||
Automated S3 testing is lacking currently, due to no convenient way to enable backups during the tests.
|
||||
After it's fixed, benchmark runs should also be carried out to find bottlenecks.
|
||||
@@ -1,689 +0,0 @@
|
||||
//! Local filesystem acting as a remote storage.
|
||||
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
|
||||
//!
|
||||
//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage};
|
||||
|
||||
pub struct LocalFs {
|
||||
pageserver_workdir: &'static Path,
|
||||
root: PathBuf,
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
format!(
|
||||
"Failed to create all directories in the given root path '{}'",
|
||||
root.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok(Self {
|
||||
pageserver_workdir,
|
||||
root,
|
||||
})
|
||||
}
|
||||
|
||||
fn resolve_in_storage(&self, path: &Path) -> anyhow::Result<PathBuf> {
|
||||
if path.is_relative() {
|
||||
Ok(self.root.join(path))
|
||||
} else if path.starts_with(&self.root) {
|
||||
Ok(path.to_path_buf())
|
||||
} else {
|
||||
bail!(
|
||||
"Path '{}' does not belong to the current storage",
|
||||
path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
type StoragePath = PathBuf;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
Ok(self.root.join(
|
||||
strip_path_prefix(self.pageserver_workdir, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
let relative_path = strip_path_prefix(&self.root, storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.pageserver_workdir.join(relative_path))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
Ok(get_all_files(&self.root).await?.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.open(&target_file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open target fs destination at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy(&mut from, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file to the local storage at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
destination.flush().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file to the local storage at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
io::copy(&mut source, to).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
source.flush().await?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
ensure!(
|
||||
end_exclusive > start_inclusive,
|
||||
"Invalid range, start ({}) is bigger then end ({:?})",
|
||||
start_inclusive,
|
||||
end_exclusive
|
||||
);
|
||||
if start_inclusive == end_exclusive.saturating_sub(1) {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")?;
|
||||
match end_exclusive {
|
||||
Some(end_exclusive) => {
|
||||
io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
|
||||
}
|
||||
None => io::copy(&mut source, to).await,
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' range from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
directory_path: P,
|
||||
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
|
||||
where
|
||||
P: AsRef<Path> + Send + Sync + 'a,
|
||||
{
|
||||
Box::pin(async move {
|
||||
let directory_path = directory_path.as_ref();
|
||||
if directory_path.exists() {
|
||||
if directory_path.is_dir() {
|
||||
let mut paths = Vec::new();
|
||||
let mut dir_contents = fs::read_dir(directory_path).await?;
|
||||
while let Some(dir_entry) = dir_contents.next_entry().await? {
|
||||
let file_type = dir_entry.file_type().await?;
|
||||
let entry_path = dir_entry.path();
|
||||
if file_type.is_symlink() {
|
||||
debug!("{:?} us a symlink, skipping", entry_path)
|
||||
} else if file_type.is_dir() {
|
||||
paths.extend(get_all_files(entry_path).await?.into_iter())
|
||||
} else {
|
||||
paths.push(dir_entry.path());
|
||||
}
|
||||
}
|
||||
Ok(paths)
|
||||
} else {
|
||||
bail!("Path '{}' is not a directory", directory_path.display())
|
||||
}
|
||||
} else {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
None => bail!(
|
||||
"File path '{}' has no parent directory",
|
||||
target_file_path.display()
|
||||
),
|
||||
};
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
|
||||
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &LocalFs, mismatching_path: &Path) -> String {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected path '{}' to error, but got storage path: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let error_string = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
assert!(error_string.contains("does not belong to this storage"));
|
||||
assert!(error_string.contains(repo_harness.conf.workdir.to_str().unwrap()));
|
||||
|
||||
let mismatching_path_str = "/something/else";
|
||||
let error_message = storage_path_error(&storage, Path::new(mismatching_path_str));
|
||||
assert!(
|
||||
error_message.contains(mismatching_path_str),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(error_message.contains("does not belong to this storage"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let name = "not a metadata";
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
storage
|
||||
.local_path(
|
||||
&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
|
||||
)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let local_metadata_path = repo_harness
|
||||
.timeline_path(&TIMELINE_ID)
|
||||
.join(METADATA_FILE_NAME);
|
||||
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
storage
|
||||
.local_path(&remote_metadata_path)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements
|
||||
fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
|
||||
match storage.local_path(storage_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected local path input {:?} to cause an error, but got file path: {:?}",
|
||||
storage_path, wrong_path,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("local_path_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path));
|
||||
assert!(error_message.contains(totally_wrong_path));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let dummy_storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage path -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use std::io::Write;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("upload_file")?;
|
||||
let storage = create_storage()?;
|
||||
|
||||
let source = create_file_for_upload(
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = PathBuf::from("/").join("somewhere").join("else");
|
||||
match storage.upload(source, &target_path).await {
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
assert!(message.contains(&target_path.display().to_string()));
|
||||
assert!(message.contains("does not belong to the current storage"));
|
||||
}
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
"Should list a two different files after second upload"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_storage() -> anyhow::Result<LocalFs> {
|
||||
let pageserver_workdir = Box::leak(Box::new(tempdir()?.path().to_owned()));
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), pageserver_workdir)?;
|
||||
Ok(storage)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage.download(&upload_target, &mut content_bytes).await?;
|
||||
content_bytes.flush().await?;
|
||||
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
contents,
|
||||
"We should upload and download the same contents"
|
||||
);
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage.download(&non_existing_path, &mut io::sink()).await {
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_positive")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.await?;
|
||||
full_range_bytes.flush().await?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
String::from_utf8(full_range_bytes.into_inner().into_inner())?,
|
||||
"Download full range should return the whole upload"
|
||||
);
|
||||
|
||||
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let same_byte = 1_000_000_000;
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
same_byte,
|
||||
Some(same_byte + 1), // exclusive end
|
||||
&mut zero_range_bytes,
|
||||
)
|
||||
.await?;
|
||||
zero_range_bytes.flush().await?;
|
||||
assert!(
|
||||
zero_range_bytes.into_inner().into_inner().is_empty(),
|
||||
"Zero byte range should not download any part of the file"
|
||||
);
|
||||
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
"First part bytes should be returned when requrested"
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
second_part_local,
|
||||
second_part_remote.as_slice(),
|
||||
"Second part bytes should be returned when requrested"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_negative")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let start = 10000;
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("Invalid range"));
|
||||
assert!(error_string.contains(&start.to_string()));
|
||||
assert!(error_string.contains(&end.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("delete_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
match storage.delete(&upload_target).await {
|
||||
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&upload_target.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn upload_dummy_file(
|
||||
harness: &RepoHarness,
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
|
||||
let storage_path = storage.root.join(relative_timeline_path).join(name);
|
||||
storage
|
||||
.upload(
|
||||
create_file_for_upload(
|
||||
&storage.pageserver_workdir.join(name),
|
||||
&dummy_contents(name),
|
||||
)
|
||||
.await?,
|
||||
&storage_path,
|
||||
)
|
||||
.await?;
|
||||
Ok(storage_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<io::BufReader<fs::File>> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create_new(true)
|
||||
.open(path)?;
|
||||
write!(file_for_writing, "{}", contents)?;
|
||||
drop(file_for_writing);
|
||||
Ok(io::BufReader::new(
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
))
|
||||
}
|
||||
|
||||
fn dummy_contents(name: &str) -> String {
|
||||
format!("contents for {}", name)
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut files = storage.list().await?;
|
||||
files.sort();
|
||||
Ok(files)
|
||||
}
|
||||
}
|
||||
@@ -1,373 +0,0 @@
|
||||
//! AWS S3 storage wrapper around `rust_s3` library.
|
||||
//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are
|
||||
//! placed in the root of the bucket.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
use tokio::io::{self, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
remote_storage::{strip_path_prefix, RemoteStorage},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
const S3_FILE_SEPARATOR: char = '/';
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
pub struct S3ObjectKey(String);
|
||||
|
||||
impl S3ObjectKey {
|
||||
fn key(&self) -> &str {
|
||||
&self.0
|
||||
}
|
||||
|
||||
fn download_destination(&self, pageserver_workdir: &Path) -> PathBuf {
|
||||
pageserver_workdir.join(self.0.split(S3_FILE_SEPARATOR).collect::<PathBuf>())
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3 {
|
||||
pageserver_workdir: &'static Path,
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl S3 {
|
||||
/// Creates the storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
.parse::<Region>()
|
||||
.context("Failed to parse the s3 region from config")?;
|
||||
let credentials = Credentials::new(
|
||||
aws_config.access_key_id.as_deref(),
|
||||
aws_config.secret_access_key.as_deref(),
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.context("Failed to create the s3 credentials")?;
|
||||
Ok(Self {
|
||||
bucket: Bucket::new_with_path_style(
|
||||
aws_config.bucket_name.as_str(),
|
||||
region,
|
||||
credentials,
|
||||
)
|
||||
.context("Failed to create the s3 bucket")?,
|
||||
pageserver_workdir,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3 {
|
||||
type StoragePath = S3ObjectKey;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
key.push(S3_FILE_SEPARATOR);
|
||||
key.push_str(&segment.to_string_lossy());
|
||||
}
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
Ok(storage_path.download_destination(self.pageserver_workdir))
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
.list(String::new(), None)
|
||||
.await
|
||||
.context("Failed to list s3 objects")?;
|
||||
|
||||
Ok(list_response
|
||||
.into_iter()
|
||||
.flat_map(|response| response.contents)
|
||||
.map(|s3_object| S3ObjectKey(s3_object.key))
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(&mut from, &mut upload_contents)
|
||||
.await
|
||||
.context("Failed to read the upload contents")?;
|
||||
upload_contents
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to read the upload contents")?;
|
||||
let upload_contents = upload_contents.into_inner().into_inner();
|
||||
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.put_object(to.key(), &upload_contents)
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
let (data, code) = self
|
||||
.bucket
|
||||
.get_object(from.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during downloading object, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
// we don't have to write vector into the destination this way, `to_write_all` would be enough.
|
||||
// but we want to prepare for migration on `rusoto`, that has a streaming HTTP body instead here, with
|
||||
// which it makes more sense to use `io::copy`.
|
||||
io::copy(&mut data.as_slice(), to)
|
||||
.await
|
||||
.context("Failed to write downloaded data into the destination buffer")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be exclusive
|
||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
let (data, code) = self
|
||||
.bucket
|
||||
.get_object_range(from.key(), start_inclusive, end_inclusive)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 206 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-206 exit code during downloading object range, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
// see `download` function above for the comment on why `Vec<u8>` buffer is copied this way
|
||||
io::copy(&mut data.as_slice(), to)
|
||||
.await
|
||||
.context("Failed to write downloaded range into the destination buffer")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.delete_object(path.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to delete s3 object with key {}", path.key()))?;
|
||||
if code != 204 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-204 exit code during deleting object with key '{}', code: {}",
|
||||
path.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn download_destination() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination")?;
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("test_name");
|
||||
let relative_path = local_path.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
|
||||
let key = S3ObjectKey(format!(
|
||||
"{}{}",
|
||||
S3_FILE_SEPARATOR,
|
||||
relative_path
|
||||
.iter()
|
||||
.map(|segment| segment.to_str().unwrap())
|
||||
.collect::<Vec<_>>()
|
||||
.join(&S3_FILE_SEPARATOR.to_string()),
|
||||
));
|
||||
|
||||
assert_eq!(
|
||||
local_path,
|
||||
key.download_destination(&repo_harness.conf.workdir),
|
||||
"Download destination should consist of s3 path joined with the pageserver workdir prefix"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
|
||||
let expected_key = S3ObjectKey(format!(
|
||||
"{SEPARATOR}{}{SEPARATOR}{}",
|
||||
segment_1,
|
||||
segment_2,
|
||||
SEPARATOR = S3_FILE_SEPARATOR,
|
||||
));
|
||||
|
||||
let actual_key = dummy_storage(&repo_harness.conf.workdir)
|
||||
.storage_path(local_path)
|
||||
.expect("Matching path should map to S3 path normally");
|
||||
assert_eq!(
|
||||
expected_key,
|
||||
actual_key,
|
||||
"S3 key from the matching path should contain all segments after the workspace prefix, separated with S3 separator"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn storage_path_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_path_error(storage: &S3, mismatching_path: &Path) -> String {
|
||||
match storage.storage_path(mismatching_path) {
|
||||
Ok(wrong_key) => panic!(
|
||||
"Expected path '{}' to error, but got S3 key: {:?}",
|
||||
mismatching_path.display(),
|
||||
wrong_key,
|
||||
),
|
||||
Err(e) => e.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("storage_path_negatives")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let error_message = storage_path_error(&storage, &repo_harness.conf.workdir);
|
||||
assert!(
|
||||
error_message.contains("Prefix and the path are equal"),
|
||||
"Message '{}' does not contain the required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
let mismatching_path = PathBuf::from("somewhere").join("else");
|
||||
let error_message = storage_path_error(&storage, &mismatching_path);
|
||||
assert!(
|
||||
error_message.contains(mismatching_path.to_str().unwrap()),
|
||||
"Error should mention wrong path"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains(repo_harness.conf.workdir.to_str().unwrap()),
|
||||
"Error should mention server workdir"
|
||||
);
|
||||
assert!(
|
||||
error_message.contains("is not prefixed with"),
|
||||
"Message '{}' does not contain a required string",
|
||||
error_message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
|
||||
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
let original_path = repo_harness.timeline_path(&TIMELINE_ID).join("some name");
|
||||
|
||||
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let key = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
"'original path -> storage key -> matching fs path' transformation should produce the same path as the input one for the correct path"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn dummy_storage(pageserver_workdir: &'static Path) -> S3 {
|
||||
S3 {
|
||||
pageserver_workdir,
|
||||
bucket: Bucket::new(
|
||||
"dummy-bucket",
|
||||
"us-east-1".parse().unwrap(),
|
||||
Credentials::anonymous().unwrap(),
|
||||
)
|
||||
.unwrap(),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey {
|
||||
S3ObjectKey(
|
||||
relative_file_path
|
||||
.iter()
|
||||
.fold(String::new(), |mut path_string, segment| {
|
||||
path_string.push(S3_FILE_SEPARATOR);
|
||||
path_string.push_str(segment.to_str().unwrap());
|
||||
path_string
|
||||
}),
|
||||
)
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,10 +1,9 @@
|
||||
use crate::relish::*;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::{AddAssign, Deref};
|
||||
use std::ops::AddAssign;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use zenith_utils::lsn::{Lsn, RecordLsn};
|
||||
@@ -14,28 +13,18 @@ use zenith_utils::zid::ZTimelineId;
|
||||
/// A repository corresponds to one .zenith directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
pub trait Repository: Send + Sync {
|
||||
fn shutdown(&self) -> Result<()>;
|
||||
|
||||
/// Stops all timeline-related process in the repository and removes the timeline data from memory.
|
||||
fn unload_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// perform one garbage collection iteration, removing old data files from disk.
|
||||
/// this funtion is periodically called by gc thread.
|
||||
/// also it can be explicitly requested through page server api 'do_gc' command.
|
||||
/// perform one garbage collection iteration.
|
||||
/// garbage collection is periodically performed by gc thread,
|
||||
/// but it can be explicitly requested through page server api.
|
||||
///
|
||||
/// 'timelineid' specifies the timeline to GC, or None for all.
|
||||
/// `horizon` specifies delta from last lsn to preserve all object versions (pitr interval).
|
||||
@@ -48,10 +37,6 @@ pub trait Repository: Send + Sync {
|
||||
horizon: u64,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult>;
|
||||
|
||||
/// perform one checkpoint iteration, flushing in-memory data on disk.
|
||||
/// this function is periodically called by checkponter thread.
|
||||
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
}
|
||||
|
||||
///
|
||||
@@ -132,64 +117,17 @@ pub trait Timeline: Send + Sync {
|
||||
/// Get a list of all existing non-relational objects
|
||||
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Get the ancestor's timeline id
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
fn get_ancestor_lsn(&self) -> Lsn;
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Public PUT functions, to update the repository with new page versions.
|
||||
//
|
||||
// These are called by the WAL receiver to digest WAL records.
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
/// Atomically get both last and prev.
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn;
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
fn get_disk_consistent_lsn(&self) -> Lsn;
|
||||
|
||||
/// Mutate the timeline with a [`TimelineWriter`].
|
||||
fn writer<'a>(&'a self) -> Box<dyn TimelineWriter + 'a>;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Check that it is valid to request operations with that lsn.
|
||||
fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
/// doesnt support TwoPhase relishes yet
|
||||
fn get_current_logical_size(&self) -> usize;
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
|
||||
/// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
|
||||
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline;
|
||||
}
|
||||
|
||||
/// Various functions to mutate the timeline.
|
||||
// TODO Currently, Deref is used to allow easy access to read methods from this trait.
|
||||
// This is probably considered a bad practice in Rust and should be fixed eventually,
|
||||
// but will cause large code changes.
|
||||
pub trait TimelineWriter: Deref<Target = dyn Timeline> {
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put_wal_record(&self, lsn: Lsn, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
|
||||
fn put_wal_record(&self, tag: RelishTag, blknum: u32, rec: WALRecord) -> Result<()>;
|
||||
|
||||
/// Like put_wal_record, but with ready-made image of the page.
|
||||
fn put_page_image(&self, tag: RelishTag, blknum: u32, lsn: Lsn, img: Bytes) -> Result<()>;
|
||||
@@ -205,10 +143,34 @@ pub trait TimelineWriter: Deref<Target = dyn Timeline> {
|
||||
/// Advance requires aligned LSN as an argument and would wake wait_lsn() callers.
|
||||
/// Previous last record LSN is stored alongside the latest and can be read.
|
||||
fn advance_last_record_lsn(&self, lsn: Lsn);
|
||||
/// Atomically get both last and prev.
|
||||
fn get_last_record_rlsn(&self) -> RecordLsn;
|
||||
/// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
|
||||
fn get_last_record_lsn(&self) -> Lsn;
|
||||
fn get_prev_record_lsn(&self) -> Lsn;
|
||||
fn get_start_lsn(&self) -> Lsn;
|
||||
|
||||
///
|
||||
/// Flush to disk all data that was written with the put_* functions
|
||||
///
|
||||
/// NOTE: This has nothing to do with checkpoint in PostgreSQL. We don't
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
/// doesnt support TwoPhase relishes yet
|
||||
fn get_current_logical_size(&self) -> usize;
|
||||
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct WALRecord {
|
||||
pub lsn: Lsn, // LSN at the *end* of the record
|
||||
pub will_init: bool,
|
||||
pub rec: Bytes,
|
||||
// Remember the offset of main_data in rec,
|
||||
@@ -217,101 +179,25 @@ pub struct WALRecord {
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod repo_harness {
|
||||
use bytes::BytesMut;
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{LayeredRepository, TIMELINES_SEGMENT_NAME},
|
||||
walredo::{WalRedoError, WalRedoManager},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use hex_literal::hex;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
pub const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
pub const NEW_TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("AA223344556677881122334455667788"));
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
pub fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u64(self.lsn.0);
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
|
||||
pub struct RepoHarness {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_id: ZTenantId,
|
||||
}
|
||||
|
||||
impl RepoHarness {
|
||||
pub fn create(test_name: &'static str) -> Result<Self> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join(TIMELINES_SEGMENT_NAME))?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
|
||||
Ok(Self { conf, tenant_id })
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Box<dyn Repository> {
|
||||
let walredo_mgr = Arc::new(TestRedoManager);
|
||||
|
||||
Box::new(LayeredRepository::new(
|
||||
self.conf,
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
false,
|
||||
))
|
||||
}
|
||||
|
||||
pub fn timeline_path(&self, timeline_id: &ZTimelineId) -> PathBuf {
|
||||
self.conf.timeline_path(timeline_id, &self.tenant_id)
|
||||
}
|
||||
}
|
||||
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager;
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let lsn = Lsn::from(buf.get_u64());
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let mut dst = vec![0u8; buf.get_u32() as usize];
|
||||
buf.copy_to_slice(&mut dst);
|
||||
WALRecord {
|
||||
lsn,
|
||||
will_init,
|
||||
rec: Bytes::from(dst),
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -322,21 +208,23 @@ pub mod repo_harness {
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::layered_repository::metadata::METADATA_FILE_NAME;
|
||||
|
||||
use super::repo_harness::*;
|
||||
use super::*;
|
||||
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::walredo::{WalRedoError, WalRedoManager};
|
||||
use crate::PageServerConf;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::xlog_utils::SIZEOF_CHECKPOINT;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A_REL_TAG: RelTag = RelTag {
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG);
|
||||
});
|
||||
const TESTREL_B: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
@@ -344,6 +232,16 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
#[allow(non_snake_case)]
|
||||
fn TEST_IMG(s: &str) -> Bytes {
|
||||
let mut buf = BytesMut::new();
|
||||
buf.extend_from_slice(s.as_bytes());
|
||||
buf.resize(8192, 0);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn assert_current_logical_size(timeline: &Arc<dyn Timeline>, lsn: Lsn) {
|
||||
let incremental = timeline.get_current_logical_size();
|
||||
let non_incremental = timeline
|
||||
@@ -355,23 +253,47 @@ mod tests {
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]);
|
||||
|
||||
fn get_test_repo(test_name: &str) -> Result<Box<dyn Repository>> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
fs::create_dir_all(&repo_dir)?;
|
||||
fs::create_dir_all(&repo_dir.join("timelines"))?;
|
||||
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
let tenantid = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenantid)).unwrap();
|
||||
|
||||
let walredo_mgr = TestRedoManager {};
|
||||
|
||||
let repo = Box::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_relsize() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_relsize")?.load();
|
||||
let repo = get_test_repo("test_relsize")?;
|
||||
// get_timeline() with non-existent timeline id should fail
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
writer.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
|
||||
writer.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x40), TEST_IMG("foo blk 1 at 4"))?;
|
||||
tline.put_page_image(TESTREL_A, 2, Lsn(0x50), TEST_IMG("foo blk 2 at 5"))?;
|
||||
|
||||
writer.advance_last_record_lsn(Lsn(0x50));
|
||||
tline.advance_last_record_lsn(Lsn(0x50));
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(0x50));
|
||||
|
||||
@@ -417,8 +339,8 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate last block
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 2)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
assert_current_logical_size(&tline, Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
@@ -440,13 +362,13 @@ mod tests {
|
||||
);
|
||||
|
||||
// Truncate to zero length
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x68));
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x68), 0)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x68));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x68))?.unwrap(), 0);
|
||||
|
||||
// Extend from 0 to 2 blocks, leaving a gap
|
||||
writer.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x70));
|
||||
tline.put_page_image(TESTREL_A, 1, Lsn(0x70), TEST_IMG("foo blk 1"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x70));
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x70))?.unwrap(), 2);
|
||||
assert_eq!(tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x70))?, ZERO_PAGE);
|
||||
assert_eq!(
|
||||
@@ -477,30 +399,30 @@ mod tests {
|
||||
// and then created it again within the same layer.
|
||||
#[test]
|
||||
fn test_drop_extend() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_drop_extend")?.load();
|
||||
let repo = get_test_repo("test_drop_extend")?;
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x20));
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x20));
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x20))?, true);
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x20))?.unwrap(), 1);
|
||||
|
||||
// Drop relish
|
||||
writer.drop_relish(TESTREL_A, Lsn(0x30))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x30));
|
||||
tline.drop_relish(TESTREL_A, Lsn(0x30))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x30));
|
||||
|
||||
// Check that rel is not visible anymore
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x30))?, false);
|
||||
assert!(tline.get_relish_size(TESTREL_A, Lsn(0x30))?.is_none());
|
||||
|
||||
// Extend it again
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
writer.advance_last_record_lsn(Lsn(0x40));
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
tline.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check that rel exists and size is correct
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x40))?, true);
|
||||
@@ -514,11 +436,11 @@ mod tests {
|
||||
// and then extended it again within the same layer.
|
||||
#[test]
|
||||
fn test_truncate_extend() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_truncate_extend")?.load();
|
||||
let repo = get_test_repo("test_truncate_extend")?;
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
//from storage_layer.rs
|
||||
const RELISH_SEG_SIZE: u32 = 10 * 1024 * 1024 / 8192;
|
||||
@@ -528,10 +450,10 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x20);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
}
|
||||
|
||||
writer.advance_last_record_lsn(Lsn(0x20));
|
||||
tline.advance_last_record_lsn(Lsn(0x20));
|
||||
|
||||
// The relation was created at LSN 2, not visible at LSN 1 yet.
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x10))?, false);
|
||||
@@ -555,8 +477,8 @@ mod tests {
|
||||
|
||||
// Truncate relation so that second segment was dropped
|
||||
// - only leave one page
|
||||
writer.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
|
||||
writer.advance_last_record_lsn(Lsn(0x60));
|
||||
tline.put_truncation(TESTREL_A, Lsn(0x60), 1)?;
|
||||
tline.advance_last_record_lsn(Lsn(0x60));
|
||||
|
||||
// Check reported size and contents after truncation
|
||||
assert_eq!(tline.get_relish_size(TESTREL_A, Lsn(0x60))?.unwrap(), 1);
|
||||
@@ -589,9 +511,9 @@ mod tests {
|
||||
for blkno in 0..relsize {
|
||||
let lsn = Lsn(0x80);
|
||||
let data = format!("foo blk {} at {}", blkno, lsn);
|
||||
writer.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
tline.put_page_image(TESTREL_A, blkno, lsn, TEST_IMG(&data))?;
|
||||
}
|
||||
writer.advance_last_record_lsn(Lsn(0x80));
|
||||
tline.advance_last_record_lsn(Lsn(0x80));
|
||||
|
||||
assert_eq!(tline.get_rel_exists(TESTREL_A, Lsn(0x80))?, true);
|
||||
assert_eq!(
|
||||
@@ -615,17 +537,17 @@ mod tests {
|
||||
/// split into multiple 1 GB segments in Postgres.
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
let repo = get_test_repo("test_large_rel")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
let mut lsn = 0x10;
|
||||
for blknum in 0..pg_constants::RELSEG_SIZE + 1 {
|
||||
let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
|
||||
tline.put_page_image(TESTREL_A, blknum as u32, Lsn(lsn), img)?;
|
||||
}
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
|
||||
assert_current_logical_size(&tline, Lsn(lsn));
|
||||
|
||||
@@ -636,8 +558,8 @@ mod tests {
|
||||
|
||||
// Truncate one block
|
||||
lsn += 0x10;
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
pg_constants::RELSEG_SIZE
|
||||
@@ -646,8 +568,8 @@ mod tests {
|
||||
|
||||
// Truncate another block
|
||||
lsn += 0x10;
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), pg_constants::RELSEG_SIZE - 1)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
pg_constants::RELSEG_SIZE - 1
|
||||
@@ -659,8 +581,8 @@ mod tests {
|
||||
let mut size: i32 = 3000;
|
||||
while size >= 0 {
|
||||
lsn += 0x10;
|
||||
writer.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
|
||||
writer.advance_last_record_lsn(Lsn(lsn));
|
||||
tline.put_truncation(TESTREL_A, Lsn(lsn), size as u32)?;
|
||||
tline.advance_last_record_lsn(Lsn(lsn));
|
||||
assert_eq!(
|
||||
tline.get_relish_size(TESTREL_A, Lsn(lsn))?.unwrap(),
|
||||
size as u32
|
||||
@@ -678,19 +600,19 @@ mod tests {
|
||||
///
|
||||
#[test]
|
||||
fn test_list_rels_drop() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_list_rels_drop")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
let repo = get_test_repo("test_list_rels_drop")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
|
||||
writer.advance_last_record_lsn(Lsn(0x30));
|
||||
tline.advance_last_record_lsn(Lsn(0x30));
|
||||
|
||||
// Check that list_rels() lists it after LSN 2, but no before it
|
||||
assert!(!tline.list_rels(0, TESTDB, Lsn(0x10))?.contains(&TESTREL_A));
|
||||
@@ -698,19 +620,17 @@ mod tests {
|
||||
assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A));
|
||||
|
||||
// Create a branch, check that the relation is visible there
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let new_writer = newtline.writer();
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
|
||||
assert!(newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x30))?
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Drop it on the branch
|
||||
new_writer.drop_relish(TESTREL_A, Lsn(0x40))?;
|
||||
new_writer.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
drop(new_writer);
|
||||
newtline.drop_relish(TESTREL_A, Lsn(0x40))?;
|
||||
newtline.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check that it's no longer listed on the branch after the point where it was dropped
|
||||
assert!(newtline
|
||||
@@ -721,8 +641,8 @@ mod tests {
|
||||
.contains(&TESTREL_A));
|
||||
|
||||
// Run checkpoint and garbage collection and check that it's still not visible
|
||||
newtline.checkpoint(CheckpointConfig::Forced)?;
|
||||
repo.gc_iteration(Some(NEW_TIMELINE_ID), 0, true)?;
|
||||
newtline.checkpoint()?;
|
||||
repo.gc_iteration(Some(newtimelineid), 0, true)?;
|
||||
|
||||
assert!(!newtline
|
||||
.list_rels(0, TESTDB, Lsn(0x40))?
|
||||
@@ -736,32 +656,32 @@ mod tests {
|
||||
///
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let writer = tline.writer();
|
||||
let repo = get_test_repo("test_branch")?;
|
||||
let timelineid = ZTimelineId::from_str("11223344556677881122334455667788").unwrap();
|
||||
let tline = repo.create_empty_timeline(timelineid)?;
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
// after branching fails below
|
||||
writer.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
tline.put_page_image(RelishTag::Checkpoint, 0, Lsn(0x10), ZERO_CHECKPOINT.clone())?;
|
||||
|
||||
// Create a relation on the timeline
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x30), TEST_IMG("foo blk 0 at 3"))?;
|
||||
tline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("foo blk 0 at 4"))?;
|
||||
|
||||
// Create another relation
|
||||
writer.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
|
||||
tline.put_page_image(TESTREL_B, 0, Lsn(0x20), TEST_IMG("foobar blk 0 at 2"))?;
|
||||
|
||||
writer.advance_last_record_lsn(Lsn(0x40));
|
||||
tline.advance_last_record_lsn(Lsn(0x40));
|
||||
assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
let new_writer = newtline.writer();
|
||||
let newtimelineid = ZTimelineId::from_str("AA223344556677881122334455667788").unwrap();
|
||||
repo.branch_timeline(timelineid, newtimelineid, Lsn(0x30))?;
|
||||
let newtline = repo.get_timeline(newtimelineid)?;
|
||||
|
||||
new_writer.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
new_writer.advance_last_record_lsn(Lsn(0x40));
|
||||
newtline.put_page_image(TESTREL_A, 0, Lsn(0x40), TEST_IMG("bar blk 0 at 4"))?;
|
||||
newtline.advance_last_record_lsn(Lsn(0x40));
|
||||
|
||||
// Check page contents on both branches
|
||||
assert_eq!(
|
||||
@@ -786,302 +706,32 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_some_layers(tline: &Arc<dyn Timeline>, start_lsn: Lsn) -> Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
// Create a relation on the timeline
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
// Mock WAL redo manager that doesn't do much
|
||||
struct TestRedoManager {}
|
||||
|
||||
impl WalRedoManager for TestRedoManager {
|
||||
fn request_redo(
|
||||
&self,
|
||||
rel: RelishTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let s = format!(
|
||||
"redo for {} blk {} to get to {}, with {} and {} records",
|
||||
rel,
|
||||
blknum,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("we might've already garbage collected needed data"))
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
|
||||
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(&err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("is earlier than initdb lsn"));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) {
|
||||
Ok(_) => panic!("request for page should have failed"),
|
||||
Err(err) => assert!(err
|
||||
.to_string()
|
||||
.contains("tried to request a page version that was garbage collected")),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
|
||||
let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?;
|
||||
let repo = harness.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
|
||||
make_some_layers(&newtline, Lsn(0x60))?;
|
||||
|
||||
// run gc on parent
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// check that the layer in parent before the branching point is still there
|
||||
let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id);
|
||||
|
||||
let expected_image_layer_path = tline_dir.join(format!(
|
||||
"rel_{}_{}_{}_{}_{}_{:016X}_{:016X}",
|
||||
TESTREL_A_REL_TAG.spcnode,
|
||||
TESTREL_A_REL_TAG.dbnode,
|
||||
TESTREL_A_REL_TAG.relnode,
|
||||
TESTREL_A_REL_TAG.forknum,
|
||||
0, // seg is 0
|
||||
0x20,
|
||||
0x30,
|
||||
));
|
||||
assert!(fs::metadata(&expected_image_layer_path).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
drop(repo);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
|
||||
assert!(metadata_path.is_file());
|
||||
|
||||
let mut metadata_bytes = std::fs::read(&metadata_path)?;
|
||||
assert_eq!(metadata_bytes.len(), 512);
|
||||
metadata_bytes[512 - 4 - 2] ^= 1;
|
||||
std::fs::write(metadata_path, metadata_bytes)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
|
||||
assert_eq!(err.to_string(), "failed to load metadata");
|
||||
assert_eq!(
|
||||
err.source().unwrap().to_string(),
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn future_layerfiles() -> Result<()> {
|
||||
const TEST_NAME: &str = "future_layerfiles";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
// Create a timeline with disk_consistent_lsn = 8000
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
|
||||
let writer = tline.writer();
|
||||
writer.advance_last_record_lsn(Lsn(0x8000));
|
||||
drop(writer);
|
||||
repo.checkpoint_iteration(CheckpointConfig::Forced)?;
|
||||
drop(repo);
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let make_empty_file = |filename: &str| -> std::io::Result<()> {
|
||||
let path = timeline_path.join(filename);
|
||||
|
||||
assert!(!path.exists());
|
||||
std::fs::write(&path, &[])?;
|
||||
|
||||
Ok(())
|
||||
};
|
||||
|
||||
// Helper function to check that a relation file exists, and a corresponding
|
||||
// <filename>.0.old file does not.
|
||||
let assert_exists = |filename: &str| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(path.exists(), "file {} was removed", filename);
|
||||
|
||||
// Check that there is no .old file
|
||||
let backup_path = timeline_path.join(format!("{}.0.old", filename));
|
||||
assert!(
|
||||
!backup_path.exists(),
|
||||
"unexpected backup file {}",
|
||||
backup_path.display()
|
||||
if base_img.is_some() {
|
||||
"base image"
|
||||
} else {
|
||||
"no base image"
|
||||
},
|
||||
records.len()
|
||||
);
|
||||
};
|
||||
|
||||
// Helper function to check that a relation file does *not* exists, and a corresponding
|
||||
// <filename>.<num>.old file does.
|
||||
let assert_is_renamed = |filename: &str, num: u32| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(
|
||||
!path.exists(),
|
||||
"file {} was not removed as expected",
|
||||
filename
|
||||
);
|
||||
|
||||
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
|
||||
assert!(
|
||||
backup_path.exists(),
|
||||
"backup file {} was not created",
|
||||
backup_path.display()
|
||||
);
|
||||
};
|
||||
|
||||
// These files are considered to be in the future and will be renamed out
|
||||
// of the way
|
||||
let future_filenames = vec![
|
||||
format!("pg_control_0_{:016X}", 0x8001),
|
||||
format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008),
|
||||
];
|
||||
// But these are not:
|
||||
let past_filenames = vec![
|
||||
format!("pg_control_0_{:016X}", 0x8000),
|
||||
format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001),
|
||||
];
|
||||
|
||||
for filename in future_filenames.iter().chain(past_filenames.iter()) {
|
||||
make_empty_file(filename)?;
|
||||
println!("{}", s);
|
||||
Ok(TEST_IMG(&s))
|
||||
}
|
||||
|
||||
// Load the timeline. This will cause the files in the "future" to be renamed
|
||||
// away.
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
for filename in future_filenames.iter() {
|
||||
assert_is_renamed(filename, 0);
|
||||
}
|
||||
for filename in past_filenames.iter() {
|
||||
assert_exists(filename);
|
||||
}
|
||||
|
||||
// Create the future files again, and load again. They should be renamed to
|
||||
// *.1.old this time.
|
||||
for filename in future_filenames.iter() {
|
||||
make_empty_file(filename)?;
|
||||
}
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
for filename in future_filenames.iter() {
|
||||
assert_is_renamed(filename, 0);
|
||||
assert_is_renamed(filename, 1);
|
||||
}
|
||||
for filename in past_filenames.iter() {
|
||||
assert_exists(filename);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,17 +2,17 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! zenith Timeline.
|
||||
//!
|
||||
use log::*;
|
||||
use postgres_ffi::nonrelfile_utils::clogpage_precedes;
|
||||
use postgres_ffi::nonrelfile_utils::slru_may_delete_clogsegment;
|
||||
use std::cmp::min;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::io::Read;
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use tracing::*;
|
||||
use anyhow::Result;
|
||||
use bytes::{Buf, Bytes};
|
||||
|
||||
use crate::relish::*;
|
||||
use crate::repository::*;
|
||||
@@ -34,11 +34,9 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
///
|
||||
pub fn import_timeline_from_postgres_datadir(
|
||||
path: &Path,
|
||||
writer: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
// Scan 'global'
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
@@ -46,10 +44,10 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
None => continue,
|
||||
|
||||
Some("pg_control") => {
|
||||
pg_control = Some(import_control_file(writer, lsn, &direntry.path())?);
|
||||
import_control_file(timeline, lsn, &direntry.path())?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
writer,
|
||||
timeline,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::GLOBALTABLESPACE_OID,
|
||||
@@ -61,7 +59,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
writer,
|
||||
timeline,
|
||||
lsn,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
@@ -88,7 +86,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
|
||||
Some("PG_VERSION") => continue,
|
||||
Some("pg_filenode.map") => import_nonrel_file(
|
||||
writer,
|
||||
timeline,
|
||||
lsn,
|
||||
RelishTag::FileNodeMap {
|
||||
spcnode: pg_constants::DEFAULTTABLESPACE_OID,
|
||||
@@ -100,7 +98,7 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// Load any relation files into the page server
|
||||
_ => import_relfile(
|
||||
&direntry.path(),
|
||||
writer,
|
||||
timeline,
|
||||
lsn,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
@@ -110,36 +108,24 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::Clog, &entry.path())?;
|
||||
import_slru_file(timeline, lsn, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(writer, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
import_slru_file(timeline, lsn, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(entry.path().to_str().unwrap(), 16)?;
|
||||
import_nonrel_file(writer, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
import_nonrel_file(timeline, lsn, RelishTag::TwoPhase { xid }, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
|
||||
// Import WAL. This is needed even when starting from a shutdown checkpoint, because
|
||||
// this reads the checkpoint record itself, advancing the tip of the timeline to
|
||||
// *after* the checkpoint record. And crucially, it initializes the 'prev_lsn'
|
||||
let pg_control = pg_control.ok_or_else(|| anyhow!("pg_control file not found"))?;
|
||||
import_wal(
|
||||
&path.join("pg_wal"),
|
||||
writer,
|
||||
Lsn(pg_control.checkPointCopy.redo),
|
||||
lsn,
|
||||
&mut pg_control.checkPointCopy.clone(),
|
||||
)?;
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -147,13 +133,12 @@ pub fn import_timeline_from_postgres_datadir(
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_relfile(
|
||||
path: &Path,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
) -> Result<()> {
|
||||
// Does it look like a relation file?
|
||||
trace!("importing rel file {}", path.display());
|
||||
|
||||
let p = parse_relfilename(path.file_name().unwrap().to_str().unwrap());
|
||||
if let Err(e) = p {
|
||||
@@ -181,14 +166,15 @@ fn import_relfile(
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(err) => match err.kind() {
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("error reading file {}: {:#}", path.display(), err);
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -205,7 +191,7 @@ fn import_relfile(
|
||||
/// are just slurped into the repository as one blob.
|
||||
///
|
||||
fn import_nonrel_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
tag: RelishTag,
|
||||
path: &Path,
|
||||
@@ -215,7 +201,7 @@ fn import_nonrel_file(
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing non-rel file {}", path.display());
|
||||
info!("importing non-rel file {}", path.display());
|
||||
|
||||
timeline.put_page_image(tag, 0, lsn, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
@@ -226,17 +212,13 @@ fn import_nonrel_file(
|
||||
///
|
||||
/// The control file is imported as is, but we also extract the checkpoint record
|
||||
/// from it and store it separated.
|
||||
fn import_control_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
path: &Path,
|
||||
) -> Result<ControlFileData> {
|
||||
fn import_control_file(timeline: &dyn Timeline, lsn: Lsn, path: &Path) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing control file {}", path.display());
|
||||
info!("importing control file {}", path.display());
|
||||
|
||||
// Import it as ControlFile
|
||||
timeline.put_page_image(
|
||||
@@ -251,24 +233,19 @@ fn import_control_file(
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
timeline.put_page_image(RelishTag::Checkpoint, 0, lsn, checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru_file(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
fn import_slru_file(timeline: &dyn Timeline, lsn: Lsn, slru: SlruKind, path: &Path) -> Result<()> {
|
||||
// Does it look like an SLRU file?
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let segno = u32::from_str_radix(path.file_name().unwrap().to_str().unwrap(), 16)?;
|
||||
|
||||
trace!("importing slru file {}", path.display());
|
||||
info!("importing slru file {}", path.display());
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
@@ -284,14 +261,15 @@ fn import_slru_file(
|
||||
}
|
||||
|
||||
// TODO: UnexpectedEof is expected
|
||||
Err(err) => match err.kind() {
|
||||
Err(e) => match e.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
// FIXME: maybe check that we read the full length of the file?
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
bail!("error reading file {}: {:#}", path.display(), err);
|
||||
error!("error reading file: {:?} ({})", path, e);
|
||||
break;
|
||||
}
|
||||
},
|
||||
};
|
||||
@@ -303,119 +281,19 @@ fn import_slru_file(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Scan PostgreSQL WAL files in given directory and load all records between
|
||||
/// 'startpoint' and 'endpoint' into the repository.
|
||||
fn import_wal(
|
||||
walpath: &Path,
|
||||
timeline: &dyn TimelineWriter,
|
||||
startpoint: Lsn,
|
||||
endpoint: Lsn,
|
||||
checkpoint: &mut CheckPoint,
|
||||
) -> Result<()> {
|
||||
let mut waldecoder = WalStreamDecoder::new(startpoint);
|
||||
|
||||
let mut segno = startpoint.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = startpoint.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = startpoint;
|
||||
|
||||
while last_lsn <= endpoint {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut buf = Vec::new();
|
||||
|
||||
// Read local file
|
||||
let mut path = walpath.join(&filename);
|
||||
|
||||
// It could be as .partial
|
||||
if !PathBuf::from(&path).exists() {
|
||||
path = walpath.join(filename + ".partial");
|
||||
}
|
||||
|
||||
// Slurp the WAL file
|
||||
let mut file = File::open(&path)?;
|
||||
|
||||
if offset > 0 {
|
||||
file.seek(SeekFrom::Start(offset as u64))?;
|
||||
}
|
||||
|
||||
let nread = file.read_to_end(&mut buf)?;
|
||||
if nread != pg_constants::WAL_SEGMENT_SIZE - offset as usize {
|
||||
// Maybe allow this for .partial files?
|
||||
error!("read only {} bytes from WAL file", nread);
|
||||
}
|
||||
|
||||
waldecoder.feed_bytes(&buf);
|
||||
|
||||
let mut nrecords = 0;
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let mut checkpoint_modified = false;
|
||||
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
save_decoded_record(
|
||||
checkpoint,
|
||||
&mut checkpoint_modified,
|
||||
timeline,
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
last_lsn = lsn;
|
||||
|
||||
if checkpoint_modified {
|
||||
let checkpoint_bytes = checkpoint.encode();
|
||||
timeline.put_page_image(
|
||||
RelishTag::Checkpoint,
|
||||
0,
|
||||
last_lsn,
|
||||
checkpoint_bytes,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
timeline.advance_last_record_lsn(last_lsn);
|
||||
nrecords += 1;
|
||||
|
||||
trace!("imported record at {} (end {})", lsn, endpoint);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("imported {} records up to {}", nrecords, last_lsn);
|
||||
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
if last_lsn != startpoint {
|
||||
debug!(
|
||||
"reached end of WAL at {}, updating checkpoint info",
|
||||
last_lsn
|
||||
);
|
||||
|
||||
timeline.advance_last_record_lsn(last_lsn);
|
||||
} else {
|
||||
info!("no WAL to import at {}", last_lsn);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Helper function to parse a WAL record and call the Timeline's PUT functions for all the
|
||||
/// relations/pages that the record affects.
|
||||
///
|
||||
pub fn save_decoded_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
decoded: &DecodedWALRecord,
|
||||
recdata: Bytes,
|
||||
lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
if checkpoint.update_next_xid(decoded.xl_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
checkpoint.update_next_xid(decoded.xl_xid);
|
||||
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
@@ -426,43 +304,14 @@ pub fn save_decoded_record(
|
||||
forknum: blk.forknum as u8,
|
||||
});
|
||||
|
||||
//
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
if blk.apply_image
|
||||
&& blk.has_image
|
||||
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
// compression of WAL is not yet supported: fall back to storing the original WAL record
|
||||
&& (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0
|
||||
{
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
|
||||
image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
|
||||
timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
|
||||
} else {
|
||||
let rec = WALRecord {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
|
||||
}
|
||||
timeline.put_wal_record(tag, blk.blkno, rec)?;
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
@@ -527,7 +376,7 @@ pub fn save_decoded_record(
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
let xlrec = XlClogTruncate::decode(&mut buf);
|
||||
save_clog_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
|
||||
save_clog_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
@@ -596,17 +445,10 @@ pub fn save_decoded_record(
|
||||
)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
save_multixact_create_record(
|
||||
checkpoint,
|
||||
checkpoint_modified,
|
||||
timeline,
|
||||
lsn,
|
||||
&xlrec,
|
||||
decoded,
|
||||
)?;
|
||||
save_multixact_create_record(checkpoint, timeline, lsn, &xlrec, decoded)?;
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(&mut buf);
|
||||
save_multixact_truncate_record(checkpoint, checkpoint_modified, timeline, lsn, &xlrec)?;
|
||||
save_multixact_truncate_record(checkpoint, timeline, lsn, &xlrec)?;
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
|
||||
let xlrec = XlRelmapUpdate::decode(&mut buf);
|
||||
@@ -615,10 +457,7 @@ pub fn save_decoded_record(
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_NEXTOID {
|
||||
let next_oid = buf.get_u32_le();
|
||||
if checkpoint.nextOid != next_oid {
|
||||
checkpoint.nextOid = next_oid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
checkpoint.nextOid = next_oid;
|
||||
} else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
|
||||
|| info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
|
||||
{
|
||||
@@ -634,7 +473,6 @@ pub fn save_decoded_record(
|
||||
);
|
||||
if (checkpoint.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 {
|
||||
checkpoint.oldestXid = xlog_checkpoint.oldestXid;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -642,11 +480,7 @@ pub fn save_decoded_record(
|
||||
}
|
||||
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_DBASE_CREATE record.
|
||||
fn save_xlog_dbase_create(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlCreateDatabase,
|
||||
) -> Result<()> {
|
||||
fn save_xlog_dbase_create(timeline: &dyn Timeline, lsn: Lsn, rec: &XlCreateDatabase) -> Result<()> {
|
||||
let db_id = rec.db_id;
|
||||
let tablespace_id = rec.tablespace_id;
|
||||
let src_db_id = rec.src_db_id;
|
||||
@@ -723,11 +557,7 @@ fn save_xlog_dbase_create(
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_SMGR_TRUNCATE record.
|
||||
///
|
||||
/// This is the same logic as in PostgreSQL's smgr_redo() function.
|
||||
fn save_xlog_smgr_truncate(
|
||||
timeline: &dyn TimelineWriter,
|
||||
lsn: Lsn,
|
||||
rec: &XlSmgrTruncate,
|
||||
) -> Result<()> {
|
||||
fn save_xlog_smgr_truncate(timeline: &dyn Timeline, lsn: Lsn, rec: &XlSmgrTruncate) -> Result<()> {
|
||||
let spcnode = rec.rnode.spcnode;
|
||||
let dbnode = rec.rnode.dbnode;
|
||||
let relnode = rec.rnode.relnode;
|
||||
@@ -789,7 +619,7 @@ fn save_xlog_smgr_truncate(
|
||||
/// Subroutine of save_decoded_record(), to handle an XLOG_XACT_* records.
|
||||
///
|
||||
fn save_xact_record(
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
parsed: &XlXactParsedRecord,
|
||||
decoded: &DecodedWALRecord,
|
||||
@@ -800,12 +630,12 @@ fn save_xact_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
@@ -821,7 +651,6 @@ fn save_xact_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::Clog,
|
||||
segno,
|
||||
@@ -847,8 +676,7 @@ fn save_xact_record(
|
||||
|
||||
fn save_clog_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlClogTruncate,
|
||||
) -> Result<()> {
|
||||
@@ -866,7 +694,6 @@ fn save_clog_truncate_record(
|
||||
// TODO Figure out if there will be any issues with replica.
|
||||
checkpoint.oldestXid = xlrec.oldest_xid;
|
||||
checkpoint.oldestXidDB = xlrec.oldest_xid_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// TODO Treat AdvanceOldestClogXid() or write a comment why we don't need it
|
||||
|
||||
@@ -909,13 +736,13 @@ fn save_clog_truncate_record(
|
||||
|
||||
fn save_multixact_create_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactCreate,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> Result<()> {
|
||||
let rec = WALRecord {
|
||||
lsn,
|
||||
will_init: false,
|
||||
rec: decoded.record.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
@@ -924,7 +751,6 @@ fn save_multixact_create_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactOffsets,
|
||||
segno,
|
||||
@@ -944,7 +770,6 @@ fn save_multixact_create_record(
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
timeline.put_wal_record(
|
||||
lsn,
|
||||
RelishTag::Slru {
|
||||
slru: SlruKind::MultiXactMembers,
|
||||
segno,
|
||||
@@ -967,11 +792,9 @@ fn save_multixact_create_record(
|
||||
}
|
||||
if xlrec.mid >= checkpoint.nextMulti {
|
||||
checkpoint.nextMulti = xlrec.mid + 1;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
if xlrec.moff + xlrec.nmembers > checkpoint.nextMultiOffset {
|
||||
checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers;
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
let max_mbr_xid = xlrec.members.iter().fold(0u32, |acc, mbr| {
|
||||
if mbr.xid.wrapping_sub(acc) as i32 > 0 {
|
||||
@@ -981,22 +804,18 @@ fn save_multixact_create_record(
|
||||
}
|
||||
});
|
||||
|
||||
if checkpoint.update_next_xid(max_mbr_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
checkpoint.update_next_xid(max_mbr_xid);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn save_multixact_truncate_record(
|
||||
checkpoint: &mut CheckPoint,
|
||||
checkpoint_modified: &mut bool,
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlMultiXactTruncate,
|
||||
) -> Result<()> {
|
||||
checkpoint.oldestMulti = xlrec.end_trunc_off;
|
||||
checkpoint.oldestMultiDB = xlrec.oldest_multi_db;
|
||||
*checkpoint_modified = true;
|
||||
|
||||
// PerformMembersTruncation
|
||||
let maxsegment: i32 = mx_offset_to_member_segment(pg_constants::MAX_MULTIXACT_OFFSET);
|
||||
@@ -1030,7 +849,7 @@ fn save_multixact_truncate_record(
|
||||
}
|
||||
|
||||
fn save_relmap_page(
|
||||
timeline: &dyn TimelineWriter,
|
||||
timeline: &dyn Timeline,
|
||||
lsn: Lsn,
|
||||
xlrec: &XlRelmapUpdate,
|
||||
decoded: &DecodedWALRecord,
|
||||
|
||||
@@ -4,267 +4,74 @@
|
||||
use crate::branches;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::repository::{Repository, Timeline};
|
||||
use crate::tenant_threads;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use log::info;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
lazy_static! {
|
||||
static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
|
||||
pub static ref REPOSITORY: Mutex<HashMap<ZTenantId, Arc<dyn Repository>>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
struct Tenant {
|
||||
state: TenantState,
|
||||
repo: Option<Arc<dyn Repository>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum TenantState {
|
||||
// This tenant only exists in cloud storage. It cannot be accessed.
|
||||
CloudOnly,
|
||||
// This tenant exists in cloud storage, and we are currently downloading it to local disk.
|
||||
// It cannot be accessed yet, not until it's been fully downloaded to local disk.
|
||||
Downloading,
|
||||
// All data for this tenant is complete on local disk, but we haven't loaded the Repository,
|
||||
// Timeline and Layer structs into memory yet, so it cannot be accessed yet.
|
||||
//Ready,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
Active,
|
||||
// Tenant is active, but there is no walreceiver connection.
|
||||
Idle,
|
||||
// This tenant exists on local disk, and the layer map has been loaded into memory.
|
||||
// The local disk might have some newer files that don't exist in cloud storage yet.
|
||||
// The tenant cannot be accessed anymore for any reason, but graceful shutdown.
|
||||
Stopping,
|
||||
}
|
||||
|
||||
/// A remote storage timeline synchronization event, that needs another step
|
||||
/// to be fully completed.
|
||||
#[derive(Debug)]
|
||||
pub enum PostTimelineSyncStep {
|
||||
/// The timeline cannot be synchronized anymore due to some sync issues.
|
||||
/// Needs to be removed from pageserver, to avoid further data diverging.
|
||||
Evict,
|
||||
/// A new timeline got downloaded and needs to be loaded into pageserver.
|
||||
RegisterDownload,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
TenantState::CloudOnly => f.write_str("CloudOnly"),
|
||||
TenantState::Downloading => f.write_str("Downloading"),
|
||||
TenantState::Active => f.write_str("Active"),
|
||||
TenantState::Idle => f.write_str("Idle"),
|
||||
TenantState::Stopping => f.write_str("Stopping"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS.lock().unwrap()
|
||||
}
|
||||
|
||||
static SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
|
||||
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
for dir_entry in fs::read_dir(conf.tenants_path()).unwrap() {
|
||||
let tenantid =
|
||||
ZTenantId::from_str(dir_entry.unwrap().file_name().to_str().unwrap()).unwrap();
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenantid);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenantid,
|
||||
));
|
||||
LayeredRepository::launch_checkpointer_thread(conf, repo.clone());
|
||||
LayeredRepository::launch_gc_thread(conf, repo.clone());
|
||||
|
||||
init_repo(conf, tenantid);
|
||||
info!("initialized storage for tenant: {}", &tenantid);
|
||||
m.insert(tenantid, repo);
|
||||
}
|
||||
}
|
||||
|
||||
fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
false,
|
||||
));
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
}
|
||||
|
||||
pub fn perform_post_timeline_sync_steps(
|
||||
conf: &'static PageServerConf,
|
||||
post_sync_steps: HashMap<(ZTenantId, ZTimelineId), PostTimelineSyncStep>,
|
||||
) {
|
||||
if post_sync_steps.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
info!("Performing {} post-sync steps", post_sync_steps.len());
|
||||
trace!("Steps: {:?}", post_sync_steps);
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
for &(tenant_id, timeline_id) in post_sync_steps.keys() {
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Downloading,
|
||||
repo: None,
|
||||
});
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => {
|
||||
init_timeline(repo.as_ref(), timeline_id);
|
||||
tenant.state = TenantState::Idle;
|
||||
return;
|
||||
}
|
||||
None => log::warn!("Initialize new repo"),
|
||||
}
|
||||
tenant.state = TenantState::Idle;
|
||||
}
|
||||
}
|
||||
|
||||
for ((tenant_id, timeline_id), post_sync_step) in post_sync_steps {
|
||||
match post_sync_step {
|
||||
PostTimelineSyncStep::Evict => {
|
||||
if let Err(e) = get_repository_for_tenant(tenant_id)
|
||||
.and_then(|repo| repo.unload_timeline(timeline_id))
|
||||
{
|
||||
error!(
|
||||
"Failed to remove repository for tenant {}, timeline {}: {:#}",
|
||||
tenant_id, timeline_id, e
|
||||
)
|
||||
}
|
||||
}
|
||||
PostTimelineSyncStep::RegisterDownload => {
|
||||
// init repo updates Tenant state
|
||||
init_repo(conf, tenant_id);
|
||||
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
match repo.get_timeline(timeline_id) {
|
||||
Ok(_timeline) => log::info!("Successfully initialized timeline {}", timeline_id),
|
||||
Err(e) => log::error!("Failed to init timeline {}, reason: {:#}", timeline_id, e),
|
||||
}
|
||||
}
|
||||
|
||||
// Check this flag in the thread loops to know when to exit
|
||||
pub fn shutdown_requested() -> bool {
|
||||
SHUTDOWN_REQUESTED.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
pub fn shutdown_all_tenants() -> Result<()> {
|
||||
SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
|
||||
|
||||
let tenantids = list_tenantids()?;
|
||||
|
||||
for tenantid in &tenantids {
|
||||
set_tenant_state(*tenantid, TenantState::Stopping)?;
|
||||
}
|
||||
|
||||
for tenantid in tenantids {
|
||||
// Wait for checkpointer and GC to finish their job
|
||||
tenant_threads::wait_for_tenant_threads_to_stop(tenantid);
|
||||
|
||||
let repo = get_repository_for_tenant(tenantid)?;
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
repo.shutdown()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn create_repository_for_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<()> {
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
}
|
||||
let tenant = Tenant {
|
||||
state: TenantState::CloudOnly,
|
||||
repo: None,
|
||||
};
|
||||
m.insert(tenantid, tenant);
|
||||
}
|
||||
let mut m = REPOSITORY.lock().unwrap();
|
||||
|
||||
// First check that the tenant doesn't exist already
|
||||
if m.get(&tenantid).is_some() {
|
||||
bail!("tenant {} already exists", tenantid);
|
||||
}
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = branches::create_repo(conf, tenantid, wal_redo_manager)?;
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
m.insert(tenantid, repo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// If tenant is not found in the repository, return CloudOnly state
|
||||
pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
|
||||
let m = access_tenants();
|
||||
match m.get(&tenantid) {
|
||||
Some(tenant) => tenant.state,
|
||||
None => TenantState::CloudOnly,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid);
|
||||
|
||||
match tenant {
|
||||
Some(tenant) => {
|
||||
if newstate == TenantState::Idle && tenant.state != TenantState::Active {
|
||||
// Only Active tenant can become Idle
|
||||
return Ok(tenant.state);
|
||||
}
|
||||
info!("set_tenant_state: {} -> {}", tenant.state, newstate);
|
||||
tenant.state = newstate;
|
||||
Ok(tenant.state)
|
||||
}
|
||||
None => bail!("Tenant not found for tenant {}", tenantid),
|
||||
}
|
||||
pub fn insert_repository_for_tenant(tenantid: ZTenantId, repo: Arc<dyn Repository>) {
|
||||
let o = &mut REPOSITORY.lock().unwrap();
|
||||
o.insert(tenantid, repo);
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<dyn Repository>> {
|
||||
let m = access_tenants();
|
||||
let tenant = m
|
||||
.get(&tenantid)
|
||||
.ok_or_else(|| anyhow!("Tenant not found for tenant {}", tenantid))?;
|
||||
|
||||
match &tenant.repo {
|
||||
Some(repo) => Ok(Arc::clone(repo)),
|
||||
None => anyhow::bail!("Repository for tenant {} is not yet valid", tenantid),
|
||||
}
|
||||
let o = &REPOSITORY.lock().unwrap();
|
||||
o.get(&tenantid)
|
||||
.map(|repo| Arc::clone(repo))
|
||||
.ok_or_else(|| anyhow!("repository not found for tenant name {}", tenantid))
|
||||
}
|
||||
|
||||
pub fn get_timeline_for_tenant(
|
||||
@@ -275,33 +82,3 @@ pub fn get_timeline_for_tenant(
|
||||
.get_timeline(timelineid)
|
||||
.with_context(|| format!("cannot fetch timeline {}", timelineid))
|
||||
}
|
||||
|
||||
fn list_tenantids() -> Result<Vec<ZTenantId>> {
|
||||
let m = access_tenants();
|
||||
m.iter()
|
||||
.map(|v| {
|
||||
let (tenantid, _) = v;
|
||||
Ok(*tenantid)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub struct TenantInfo {
|
||||
#[serde(with = "hex")]
|
||||
pub id: ZTenantId,
|
||||
pub state: TenantState,
|
||||
}
|
||||
|
||||
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
|
||||
let m = access_tenants();
|
||||
m.iter()
|
||||
.map(|v| {
|
||||
let (id, tenant) = v;
|
||||
Ok(TenantInfo {
|
||||
id: *id,
|
||||
state: tenant.state,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as checkpointer and GC
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::CheckpointConfig;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Mutex;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
struct TenantHandleEntry {
|
||||
checkpointer_handle: Option<JoinHandle<()>>,
|
||||
gc_handle: Option<JoinHandle<()>>,
|
||||
}
|
||||
|
||||
// Preserve handles to wait for thread completion
|
||||
// at shutdown
|
||||
lazy_static! {
|
||||
static ref TENANT_HANDLES: Mutex<HashMap<ZTenantId, TenantHandleEntry>> =
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref TENANT_THREADS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"tenant_threads_count",
|
||||
"Number of live tenant threads",
|
||||
&["tenant_thread_type"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
// Launch checkpointer and GC for the tenant.
|
||||
// It's possible that the threads are running already,
|
||||
// if so, just don't spawn new ones.
|
||||
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = handles
|
||||
.entry(tenantid)
|
||||
.or_insert_with(|| TenantHandleEntry {
|
||||
checkpointer_handle: None,
|
||||
gc_handle: None,
|
||||
});
|
||||
|
||||
if h.checkpointer_handle.is_none() {
|
||||
h.checkpointer_handle = std::thread::Builder::new()
|
||||
.name("Checkpointer thread".into())
|
||||
.spawn(move || {
|
||||
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
if h.gc_handle.is_none() {
|
||||
h.gc_handle = std::thread::Builder::new()
|
||||
.name("GC thread".into())
|
||||
.spawn(move || {
|
||||
gc_loop(tenantid, conf).expect("GC thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
if let Some(h) = handles.get_mut(&tenantid) {
|
||||
h.checkpointer_handle.take().map(JoinHandle::join);
|
||||
trace!("checkpointer for tenant {} has stopped", tenantid);
|
||||
h.gc_handle.take().map(JoinHandle::join);
|
||||
trace!("gc for tenant {} has stopped", tenantid);
|
||||
}
|
||||
handles.remove(&tenantid);
|
||||
}
|
||||
|
||||
///
|
||||
/// Checkpointer thread's main loop
|
||||
///
|
||||
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
break;
|
||||
}
|
||||
|
||||
std::thread::sleep(conf.checkpoint_period);
|
||||
trace!("checkpointer thread for tenant {} waking up", tenantid);
|
||||
|
||||
// checkpoint timelines that have accumulated more than CHECKPOINT_DISTANCE
|
||||
// bytes of WAL since last checkpoint.
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.checkpoint_iteration(CheckpointConfig::Distance(conf.checkpoint_distance))?;
|
||||
}
|
||||
|
||||
trace!(
|
||||
"checkpointer thread stopped for tenant {} state is {}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// GC thread's main loop
|
||||
///
|
||||
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
break;
|
||||
}
|
||||
|
||||
trace!("gc thread for tenant {} waking up", tenantid);
|
||||
|
||||
// Garbage collect old files that are not needed for PITR anymore
|
||||
if conf.gc_horizon > 0 {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.gc_iteration(None, conf.gc_horizon, false).unwrap();
|
||||
}
|
||||
|
||||
// TODO Write it in more adequate way using
|
||||
// condvar.wait_timeout() or something
|
||||
let mut sleep_time = conf.gc_period.as_secs();
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == TenantState::Active {
|
||||
sleep_time -= 1;
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
trace!(
|
||||
"GC thread stopped for tenant {} state is {}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,619 +0,0 @@
|
||||
//!
|
||||
//! VirtualFile is like a normal File, but it's not bound directly to
|
||||
//! a file descriptor. Instead, the file is opened when it's read from,
|
||||
//! and if too many files are open globally in the system, least-recently
|
||||
//! used ones are closed.
|
||||
//!
|
||||
//! To track which files have been recently used, we use the clock algorithm
|
||||
//! with a 'recently_used' flag on each slot.
|
||||
//!
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
/// and re-opened when it's accessed again.
|
||||
///
|
||||
/// Like with std::fs::File, multiple threads can read/write the file concurrently,
|
||||
/// holding just a shared reference the same VirtualFile, using the read_at() / write_at()
|
||||
/// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
|
||||
/// require a mutable reference, because they modify the "current position".
|
||||
///
|
||||
/// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
|
||||
/// slot that 'handle points to, if the underlying file is currently open. If it's not
|
||||
/// currently open, the 'handle' can still point to the slot where it was last kept. The
|
||||
/// 'tag' field is used to detect whether the handle still is valid or not.
|
||||
///
|
||||
pub struct VirtualFile {
|
||||
/// Lazy handle to the global file descriptor cache. The slot that this points to
|
||||
/// might contain our File, or it may be empty, or it may contain a File that
|
||||
/// belongs to a different VirtualFile.
|
||||
handle: RwLock<SlotHandle>,
|
||||
|
||||
/// Current file position
|
||||
pos: u64,
|
||||
|
||||
/// File path and options to use to open it.
|
||||
///
|
||||
/// Note: this only contains the options needed to re-open it. For example,
|
||||
/// if a new file is created, we only pass the create flag when it's initially
|
||||
/// opened, in the VirtualFile::create() function, and strip the flag before
|
||||
/// storing it here.
|
||||
pub path: PathBuf,
|
||||
open_options: OpenOptions,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone, Copy)]
|
||||
struct SlotHandle {
|
||||
/// Index into OPEN_FILES.slots
|
||||
index: usize,
|
||||
|
||||
/// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
|
||||
/// been recycled and no longer contains the FD for this virtual file.
|
||||
tag: u64,
|
||||
}
|
||||
|
||||
/// OPEN_FILES is the global array that holds the physical file descriptors that
|
||||
/// are currently open. Each slot in the array is protected by a separate lock,
|
||||
/// so that different files can be accessed independently. The lock must be held
|
||||
/// in write mode to replace the slot with a different file, but a read mode
|
||||
/// is enough to operate on the file, whether you're reading or writing to it.
|
||||
///
|
||||
/// OPEN_FILES starts in uninitialized state, and it's initialized by
|
||||
/// the virtual_file::init() function. It must be called exactly once at page
|
||||
/// server startup.
|
||||
static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
|
||||
|
||||
struct OpenFiles {
|
||||
slots: &'static [Slot],
|
||||
|
||||
/// clock arm for the clock algorithm
|
||||
next: AtomicUsize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: RwLock<SlotInner>,
|
||||
|
||||
/// has this file been used since last clock sweep?
|
||||
recently_used: AtomicBool,
|
||||
}
|
||||
|
||||
struct SlotInner {
|
||||
/// Counter that's incremented every time a different file is stored here.
|
||||
/// To avoid the ABA problem.
|
||||
tag: u64,
|
||||
|
||||
/// the underlying file
|
||||
file: Option<File>,
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
/// Find a slot to use, evicting an existing file descriptor if needed.
|
||||
///
|
||||
/// On return, we hold a lock on the slot, and its 'tag' has been updated
|
||||
/// recently_used has been set. It's all ready for reuse.
|
||||
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
//
|
||||
// Run the clock algorithm to find a slot to replace.
|
||||
//
|
||||
let num_slots = self.slots.len();
|
||||
let mut retries = 0;
|
||||
let mut slot;
|
||||
let mut slot_guard;
|
||||
let index;
|
||||
loop {
|
||||
let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
|
||||
slot = &self.slots[next];
|
||||
|
||||
// If the recently_used flag on this slot is set, continue the clock
|
||||
// sweep. Otherwise try to use this slot. If we cannot acquire the
|
||||
// lock, also continue the clock sweep.
|
||||
//
|
||||
// We only continue in this manner for a while, though. If we loop
|
||||
// through the array twice without finding a victim, just pick the
|
||||
// next slot and wait until we can reuse it. This way, we avoid
|
||||
// spinning in the extreme case that all the slots are busy with an
|
||||
// I/O operation.
|
||||
if retries < num_slots * 2 {
|
||||
if !slot.recently_used.swap(false, Ordering::Release) {
|
||||
if let Ok(guard) = slot.inner.try_write() {
|
||||
slot_guard = guard;
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
retries += 1;
|
||||
} else {
|
||||
slot_guard = slot.inner.write().unwrap();
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// We now have the victim slot locked. If it was in use previously, close the
|
||||
// old file.
|
||||
//
|
||||
if let Some(old_file) = slot_guard.file.take() {
|
||||
drop(old_file);
|
||||
}
|
||||
|
||||
// Prepare the slot for reuse and return it
|
||||
slot_guard.tag += 1;
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
(
|
||||
SlotHandle {
|
||||
index,
|
||||
tag: slot_guard.tag,
|
||||
},
|
||||
slot_guard,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true))
|
||||
}
|
||||
|
||||
/// Create a new file for writing. If the file exists, it will be truncated.
|
||||
/// Like File::create.
|
||||
pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(
|
||||
path,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)
|
||||
}
|
||||
|
||||
/// Open a file with given options.
|
||||
///
|
||||
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
|
||||
/// they will be applied also when the file is subsequently re-opened, not only
|
||||
/// on the first time. Make sure that's sane!
|
||||
pub fn open_with_options(
|
||||
path: &Path,
|
||||
open_options: &OpenOptions,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
|
||||
let file = open_options.open(path)?;
|
||||
|
||||
// Strip all options other than read and write.
|
||||
//
|
||||
// It would perhaps be nicer to check just for the read and write flags
|
||||
// explicitly, but OpenOptions doesn't contain any functions to read flags,
|
||||
// only to set them.
|
||||
let mut reopen_options = open_options.clone();
|
||||
reopen_options.create(false);
|
||||
reopen_options.create_new(false);
|
||||
reopen_options.truncate(false);
|
||||
|
||||
let vfile = VirtualFile {
|
||||
handle: RwLock::new(handle),
|
||||
pos: 0,
|
||||
path: path.to_path_buf(),
|
||||
open_options: reopen_options,
|
||||
};
|
||||
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
Ok(vfile)
|
||||
}
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file(|file| file.sync_all())?
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
fn with_file<F, R>(&self, mut func: F) -> Result<R, Error>
|
||||
where
|
||||
F: FnMut(&File) -> R,
|
||||
{
|
||||
let open_files = get_open_files();
|
||||
|
||||
let mut handle_guard = {
|
||||
// Read the cached slot handle, and see if the slot that it points to still
|
||||
// contains our File.
|
||||
//
|
||||
// We only need to hold the handle lock while we read the current handle. If
|
||||
// another thread closes the file and recycles the slot for a different file,
|
||||
// we will notice that the handle we read is no longer valid and retry.
|
||||
let mut handle = *self.handle.read().unwrap();
|
||||
loop {
|
||||
// Check if the slot contains our File
|
||||
{
|
||||
let slot = &open_files.slots[handle.index];
|
||||
let slot_guard = slot.inner.read().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
if let Some(file) = &slot_guard.file {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(func(file));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The slot didn't contain our File. We will have to open it ourselves,
|
||||
// but before that, grab a write lock on handle in the VirtualFile, so
|
||||
// that no other thread will try to concurrently open the same file.
|
||||
let handle_guard = self.handle.write().unwrap();
|
||||
|
||||
// If another thread changed the handle while we were not holding the lock,
|
||||
// then the handle might now be valid again. Loop back to retry.
|
||||
if *handle_guard != handle {
|
||||
handle = *handle_guard;
|
||||
continue;
|
||||
}
|
||||
break handle_guard;
|
||||
}
|
||||
};
|
||||
|
||||
// We need to open the file ourselves. The handle in the VirtualFile is
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
|
||||
// Open the physical file
|
||||
let file = self.open_options.open(&self.path)?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
//
|
||||
// TODO: We could downgrade the locks to read mode before calling
|
||||
// 'func', to allow a little bit more concurrency, but the standard
|
||||
// library RwLock doesn't allow downgrading without releasing the lock,
|
||||
// and that doesn't seem worth the trouble. (parking_lot RwLock would
|
||||
// allow it)
|
||||
let result = func(&file);
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
*handle_guard = handle;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut().unwrap();
|
||||
|
||||
// We could check with a read-lock first, to avoid waiting on an
|
||||
// unrelated I/O.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
slot_guard.file.take();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for VirtualFile {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.read_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for VirtualFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
// flush is no-op for File (at least on unix), so we don't need to do
|
||||
// anything here either.
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for VirtualFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = self.with_file(|mut file| file.seek(SeekFrom::End(offset)))??
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
if pos < 0 {
|
||||
return Err(Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"offset would be negative",
|
||||
));
|
||||
}
|
||||
if pos > u64::MAX as i128 {
|
||||
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
|
||||
}
|
||||
self.pos = pos as u64;
|
||||
}
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for VirtualFile {
|
||||
fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
self.with_file(|file| file.read_at(buf, offset))?
|
||||
}
|
||||
|
||||
fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
self.with_file(|file| file.write_at(buf, offset))?
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
fn new(num_slots: usize) -> OpenFiles {
|
||||
let mut slots = Box::new(Vec::with_capacity(num_slots));
|
||||
for _ in 0..num_slots {
|
||||
let slot = Slot {
|
||||
recently_used: AtomicBool::new(false),
|
||||
inner: RwLock::new(SlotInner { tag: 0, file: None }),
|
||||
};
|
||||
slots.push(slot);
|
||||
}
|
||||
|
||||
OpenFiles {
|
||||
next: AtomicUsize::new(0),
|
||||
slots: Box::leak(slots),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize the virtual file module. This must be called once at page
|
||||
/// server startup.
|
||||
///
|
||||
pub fn init(num_slots: usize) {
|
||||
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
}
|
||||
|
||||
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
|
||||
|
||||
// Get a handle to the global slots array.
|
||||
fn get_open_files() -> &'static OpenFiles {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// virtual_file::init(). Initialize it here, with a small array.
|
||||
//
|
||||
// This applies to the virtual file tests below, but all other unit
|
||||
// tests too, so the virtual file facility is always usable in
|
||||
// unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
|
||||
} else {
|
||||
OPEN_FILES.get().expect("virtual_file::init not called yet")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
|
||||
where
|
||||
FD: Read,
|
||||
{
|
||||
let mut buf = String::new();
|
||||
vfile.read_to_string(&mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
|
||||
where
|
||||
FD: FileExt,
|
||||
{
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0);
|
||||
vfile.read_exact_at(&mut buf, pos)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_virtual_files() -> Result<(), Error> {
|
||||
// The real work is done in the test_files() helper function. This
|
||||
// allows us to run the same set of tests against a native File, and
|
||||
// VirtualFile. We trust the native Files and wouldn't need to test them,
|
||||
// but this allows us to verify that the operations return the same
|
||||
// results with VirtualFiles as with native Files. (Except that with
|
||||
// native files, you will run out of file descriptors if the ulimit
|
||||
// is low enough.)
|
||||
test_files("virtual_files", |path, open_options| {
|
||||
VirtualFile::open_with_options(path, open_options)
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_physical_files() -> Result<(), Error> {
|
||||
test_files("physical_files", |path, open_options| {
|
||||
open_options.open(path)
|
||||
})
|
||||
}
|
||||
|
||||
fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
where
|
||||
FD: Read + Write + Seek + FileExt,
|
||||
OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
|
||||
{
|
||||
let testdir = crate::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
let path_a = testdir.join("file_a");
|
||||
let mut file_a = openfunc(
|
||||
&path_a,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)?;
|
||||
file_a.write_all(b"foobar")?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
assert!(read_string(&mut file_a).is_err());
|
||||
|
||||
// Close the file and re-open for reading
|
||||
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
assert!(file_a.write(b"bar").is_err());
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", read_string(&mut file_a)?);
|
||||
|
||||
// It's positioned at the EOF now.
|
||||
assert_eq!("", read_string(&mut file_a)?);
|
||||
|
||||
// Test seeks.
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
|
||||
assert_eq!("ar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
|
||||
assert_eq!("bar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Test erroneous seeks to before byte 0
|
||||
assert!(file_a.seek(SeekFrom::End(-7)).is_err());
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
|
||||
|
||||
// the erroneous seek should have left the position unchanged
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Create another test file, and try FileExt functions on it.
|
||||
let path_b = testdir.join("file_b");
|
||||
let mut file_b = openfunc(
|
||||
&path_b,
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true),
|
||||
)?;
|
||||
file_b.write_all_at(b"BAR", 3)?;
|
||||
file_b.write_all_at(b"FOO", 0)?;
|
||||
|
||||
assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
|
||||
|
||||
// Open a lot of files, enough to cause some evictions. (Or to be precise,
|
||||
// open the same file many times. The effect is the same.)
|
||||
//
|
||||
// leave file_a positioned at offset 1 before we start
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
|
||||
let mut vfiles = Vec::new();
|
||||
for _ in 0..100 {
|
||||
let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
|
||||
assert_eq!("FOOBAR", read_string(&mut vfile)?);
|
||||
vfiles.push(vfile);
|
||||
}
|
||||
|
||||
// make sure we opened enough files to definitely cause evictions.
|
||||
assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
|
||||
|
||||
// The underlying file descriptor for 'file_a' should be closed now. Try to read
|
||||
// from it again. We left the file positioned at offset 1 above.
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Check that all the other FDs still work too. Use them in random order for
|
||||
// good measure.
|
||||
vfiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for vfile in vfiles.iter_mut() {
|
||||
assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test using VirtualFiles from many threads concurrently. This tests both using
|
||||
/// a lot of VirtualFiles concurrently, causing evictions, and also using the same
|
||||
/// VirtualFile from multiple threads concurrently.
|
||||
#[test]
|
||||
fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const VIRTUAL_FILES: usize = 100;
|
||||
const THREADS: usize = 100;
|
||||
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
|
||||
|
||||
let testdir = crate::PageServerConf::test_repo_dir("vfile_concurrency");
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
// Create a test file.
|
||||
let test_file_path = testdir.join("concurrency_test_file");
|
||||
{
|
||||
let file = File::create(&test_file_path)?;
|
||||
file.write_all_at(&SAMPLE, 0)?;
|
||||
}
|
||||
|
||||
// Open the file many times.
|
||||
let mut files = Vec::new();
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
|
||||
files.push(f);
|
||||
}
|
||||
let files = Arc::new(files);
|
||||
|
||||
// Launch many threads, and use the virtual files concurrently in random order.
|
||||
let mut threads = Vec::new();
|
||||
for threadno in 0..THREADS {
|
||||
let builder =
|
||||
thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
|
||||
|
||||
let files = files.clone();
|
||||
let thread = builder
|
||||
.spawn(move || {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
f.read_exact_at(&mut buf, 0).unwrap();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(thread);
|
||||
}
|
||||
|
||||
for thread in threads {
|
||||
thread.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -72,10 +72,6 @@ impl WalStreamDecoder {
|
||||
/// Err(WalDecodeError): an error occured while decoding, meaning the input was invalid.
|
||||
///
|
||||
pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
|
||||
let recordbuf;
|
||||
|
||||
// Run state machine that validates page headers, and reassembles records
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
@@ -124,41 +120,29 @@ impl WalStreamDecoder {
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
// need to have at least the xl_tot_len field
|
||||
|
||||
if self.inputbuf.remaining() < 4 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// peek xl_tot_len at the beginning of the record.
|
||||
// FIXME: assumes little-endian
|
||||
// read xl_tot_len FIXME: assumes little-endian
|
||||
self.startlsn = self.lsn;
|
||||
let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
|
||||
let xl_tot_len = self.inputbuf.get_u32_le();
|
||||
if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
|
||||
return Err(WalDecodeError {
|
||||
msg: format!("invalid xl_tot_len {}", xl_tot_len),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
self.lsn += 4;
|
||||
|
||||
// Fast path for the common case that the whole record fits on the page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
|
||||
// Take the record from the 'inputbuf', and validate it.
|
||||
recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
|
||||
self.lsn += xl_tot_len as u64;
|
||||
break;
|
||||
} else {
|
||||
// Need to assemble the record from pieces. Remember the size of the
|
||||
// record, and loop back. On next iteration, we will reach the 'else'
|
||||
// branch below, and copy the part of the record that was on this page
|
||||
// to 'recordbuf'. Subsequent iterations will skip page headers, and
|
||||
// append the continuations from the next pages to 'recordbuf'.
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.contlen = xl_tot_len;
|
||||
continue;
|
||||
}
|
||||
self.recordbuf.clear();
|
||||
self.recordbuf.reserve(xl_tot_len as usize);
|
||||
self.recordbuf.put_u32_le(xl_tot_len);
|
||||
|
||||
self.contlen = xl_tot_len - 4;
|
||||
continue;
|
||||
} else {
|
||||
// we're continuing a record, possibly from previous page.
|
||||
let pageleft = self.lsn.remaining_in_block() as u32;
|
||||
@@ -175,42 +159,47 @@ impl WalStreamDecoder {
|
||||
self.contlen -= n as u32;
|
||||
|
||||
if self.contlen == 0 {
|
||||
// The record is now complete.
|
||||
recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
|
||||
break;
|
||||
let recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new());
|
||||
|
||||
let recordbuf = recordbuf.freeze();
|
||||
let mut buf = recordbuf.clone();
|
||||
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen =
|
||||
self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
let mut crc = crc32c_append(0, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
return Ok(Some(result));
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// check record boundaries
|
||||
|
||||
// We now have a record in the 'recordbuf' local variable.
|
||||
let xlogrec = XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]);
|
||||
// deal with continuation records
|
||||
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &recordbuf[XLOG_RECORD_CRC_OFFS + 4..]);
|
||||
crc = crc32c_append(crc, &recordbuf[0..XLOG_RECORD_CRC_OFFS]);
|
||||
if crc != xlogrec.xl_crc {
|
||||
return Err(WalDecodeError {
|
||||
msg: "WAL record crc mismatch".into(),
|
||||
lsn: self.lsn,
|
||||
});
|
||||
}
|
||||
|
||||
// XLOG_SWITCH records are special. If we see one, we need to skip
|
||||
// to the next WAL segment.
|
||||
if xlogrec.is_xlog_switch_record() {
|
||||
trace!("saw xlog switch record at {}", self.lsn);
|
||||
self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
|
||||
} else {
|
||||
// Pad to an 8-byte boundary
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
// deal with xlog_switch records
|
||||
}
|
||||
}
|
||||
|
||||
@@ -229,18 +218,17 @@ pub struct DecodedBkpBlock {
|
||||
pub blkno: u32,
|
||||
|
||||
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
||||
pub flags: u8,
|
||||
flags: u8,
|
||||
|
||||
/* Information on full-page image, if any */
|
||||
pub has_image: bool, /* has image, even for consistency checking */
|
||||
has_image: bool, /* has image, even for consistency checking */
|
||||
pub apply_image: bool, /* has image that should be restored */
|
||||
pub will_init: bool, /* record doesn't need previous page version to apply */
|
||||
//char *bkp_image;
|
||||
pub hole_offset: u16,
|
||||
pub hole_length: u16,
|
||||
pub bimg_offset: u32,
|
||||
pub bimg_len: u16,
|
||||
pub bimg_info: u8,
|
||||
hole_offset: u16,
|
||||
hole_length: u16,
|
||||
bimg_len: u16,
|
||||
bimg_info: u8,
|
||||
|
||||
/* Buffer holding the rmgr-specific data associated with this block */
|
||||
has_data: bool,
|
||||
@@ -860,19 +848,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
|
||||
// 3. Decode blocks.
|
||||
let mut ptr = record.len() - buf.remaining();
|
||||
for blk in blocks.iter_mut() {
|
||||
if blk.has_image {
|
||||
blk.bimg_offset = ptr as u32;
|
||||
ptr += blk.bimg_len as usize;
|
||||
}
|
||||
if blk.has_data {
|
||||
ptr += blk.data_len as usize;
|
||||
}
|
||||
}
|
||||
// We don't need them, so just skip blocks_total_len bytes
|
||||
buf.advance(blocks_total_len as usize);
|
||||
assert_eq!(ptr, record.len() - buf.remaining());
|
||||
|
||||
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;
|
||||
|
||||
|
||||
@@ -8,28 +8,26 @@
|
||||
use crate::relish::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::tenant_threads;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
use anyhow::{Error, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use postgres::fallible_iterator::FallibleIterator;
|
||||
use postgres::replication::ReplicationIter;
|
||||
use postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use postgres_protocol::message::backend::ReplicationMessage;
|
||||
use postgres_types::PgLsn;
|
||||
use std::cell::Cell;
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Mutex;
|
||||
use std::thread;
|
||||
use std::thread::sleep;
|
||||
use std::thread::JoinHandle;
|
||||
use std::thread_local;
|
||||
use std::time::{Duration, SystemTime};
|
||||
use tracing::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
@@ -39,8 +37,6 @@ use zenith_utils::zid::ZTimelineId;
|
||||
//
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
wal_receiver_handle: Option<JoinHandle<()>>,
|
||||
tenantid: ZTenantId,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -48,43 +44,6 @@ lazy_static! {
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
thread_local! {
|
||||
// Boolean that is true only for WAL receiver threads
|
||||
//
|
||||
// This is used in `wait_lsn` to guard against usage that might lead to a deadlock.
|
||||
pub(crate) static IS_WAL_RECEIVER: Cell<bool> = Cell::new(false);
|
||||
}
|
||||
|
||||
// Wait for walreceiver to stop
|
||||
// Now it stops when pageserver shutdown is requested.
|
||||
// In future we can make this more granular and send shutdown signals
|
||||
// per tenant/timeline to cancel inactive walreceivers.
|
||||
// TODO deal with blocking pg connections
|
||||
pub fn stop_wal_receiver(timelineid: ZTimelineId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
if let Some(r) = receivers.get_mut(&timelineid) {
|
||||
r.wal_receiver_handle.take();
|
||||
// r.wal_receiver_handle.take().map(JoinHandle::join);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
receivers.remove(&timelineid);
|
||||
|
||||
// Check if it was the last walreceiver of the tenant.
|
||||
// TODO now we store one WalReceiverEntry per timeline,
|
||||
// so this iterator looks a bit strange.
|
||||
for (_timelineid, entry) in receivers.iter() {
|
||||
if entry.tenantid == tenantid {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// When last walreceiver of the tenant is gone, change state to Idle
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -99,24 +58,21 @@ pub fn launch_wal_receiver(
|
||||
receiver.wal_producer_connstr = wal_producer_connstr.into();
|
||||
}
|
||||
None => {
|
||||
let wal_receiver_handle = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
IS_WAL_RECEIVER.with(|c| c.set(true));
|
||||
thread_main(conf, timelineid, tenantid);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
wal_receiver_handle: Some(wal_receiver_handle),
|
||||
tenantid,
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Update tenant state and start tenant threads, if they are not running yet.
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap();
|
||||
tenant_threads::start_tenant_threads(conf, tenantid);
|
||||
// Also launch a new thread to handle this connection
|
||||
//
|
||||
// NOTE: This thread name is checked in the assertion in wait_lsn. If you change
|
||||
// this, make sure you update the assertion too.
|
||||
let _walreceiver_thread = thread::Builder::new()
|
||||
.name("WAL receiver thread".into())
|
||||
.spawn(move || {
|
||||
thread_main(conf, timelineid, tenantid);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -136,18 +92,16 @@ fn get_wal_producer_connstr(timelineid: ZTimelineId) -> String {
|
||||
// This is the entry point for the WAL receiver thread.
|
||||
//
|
||||
fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("WAL receiver thread started");
|
||||
|
||||
let mut retry_count = 10;
|
||||
info!(
|
||||
"WAL receiver thread started for timeline : '{}'",
|
||||
timelineid
|
||||
);
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
// TODO How long should we retry in case of losing connection?
|
||||
// Should we retry at all or we can wait for the next callmemaybe request?
|
||||
//
|
||||
while !tenant_mgr::shutdown_requested() && retry_count > 0 {
|
||||
loop {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
@@ -158,24 +112,13 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
retry_count -= 1;
|
||||
sleep(Duration::from_secs(1));
|
||||
} else {
|
||||
info!(
|
||||
"walreceiver disconnected tenant {}, timelineid {}",
|
||||
tenantid, timelineid
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
info!("WAL streaming shut down");
|
||||
// Drop it from list of active WAL_RECEIVERS
|
||||
// so that next callmemaybe request launched a new thread
|
||||
drop_wal_receiver(timelineid, tenantid);
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
_conf: &PageServerConf,
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
wal_producer_connstr: &str,
|
||||
tenantid: ZTenantId,
|
||||
@@ -215,15 +158,15 @@ fn walreceiver_main(
|
||||
let mut startpoint = last_rec_lsn;
|
||||
|
||||
if startpoint == Lsn(0) {
|
||||
bail!("No previous WAL position");
|
||||
error!("No previous WAL position");
|
||||
}
|
||||
|
||||
// There might be some padding after the last full record, skip it.
|
||||
startpoint += startpoint.calc_padding(8u32);
|
||||
|
||||
info!(
|
||||
"last_record_lsn {} starting replication from {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, end_of_wal
|
||||
"last_record_lsn {} starting replication from {} for timeline {}, server is at {}...",
|
||||
last_rec_lsn, startpoint, timelineid, end_of_wal
|
||||
);
|
||||
|
||||
let query = format!("START_REPLICATION PHYSICAL {}", startpoint);
|
||||
@@ -245,38 +188,34 @@ fn walreceiver_main(
|
||||
let data = xlog_data.data();
|
||||
let startlsn = Lsn::from(xlog_data.wal_start());
|
||||
let endlsn = startlsn + data.len() as u64;
|
||||
let prev_last_rec_lsn = last_rec_lsn;
|
||||
|
||||
trace!("received XLogData between {} and {}", startlsn, endlsn);
|
||||
|
||||
waldecoder.feed_bytes(data);
|
||||
|
||||
while let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let _enter = info_span!("processing record", lsn = %lsn).entered();
|
||||
// Save old checkpoint value to compare with it after decoding WAL record
|
||||
let old_checkpoint_bytes = checkpoint.encode();
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
|
||||
// It is important to deal with the aligned records as lsn in getPage@LSN is
|
||||
// aligned and can be several bytes bigger. Without this alignment we are
|
||||
// at risk of hittind a deadlock.
|
||||
assert!(lsn.is_aligned());
|
||||
|
||||
let writer = timeline.writer();
|
||||
|
||||
let mut checkpoint_modified = false;
|
||||
|
||||
let decoded = decode_wal_record(recdata.clone());
|
||||
restore_local_repo::save_decoded_record(
|
||||
&mut checkpoint,
|
||||
&mut checkpoint_modified,
|
||||
writer.as_ref(),
|
||||
&*timeline,
|
||||
&decoded,
|
||||
recdata,
|
||||
lsn,
|
||||
)?;
|
||||
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
// Check if checkpoint data was updated by save_decoded_record
|
||||
if checkpoint_modified {
|
||||
let new_checkpoint_bytes = checkpoint.encode();
|
||||
|
||||
writer.put_page_image(
|
||||
if new_checkpoint_bytes != old_checkpoint_bytes {
|
||||
timeline.put_page_image(
|
||||
RelishTag::Checkpoint,
|
||||
0,
|
||||
lsn,
|
||||
@@ -286,10 +225,38 @@ fn walreceiver_main(
|
||||
|
||||
// Now that this record has been fully handled, including updating the
|
||||
// checkpoint data, let the repository know that it is up-to-date to this LSN
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
timeline.advance_last_record_lsn(lsn);
|
||||
last_rec_lsn = lsn;
|
||||
}
|
||||
|
||||
// Somewhat arbitrarily, if we have at least 10 complete wal segments (16 MB each),
|
||||
// "checkpoint" the repository to flush all the changes from WAL we've processed
|
||||
// so far to disk. After this, we don't need the original WAL anymore, and it
|
||||
// can be removed. This is probably too aggressive for production, but it's useful
|
||||
// to expose bugs now.
|
||||
//
|
||||
// TODO: We don't actually dare to remove the WAL. It's useful for debugging,
|
||||
// and we might it for logical decoding other things in the future. Although
|
||||
// we should also be able to fetch it back from the WAL safekeepers or S3 if
|
||||
// needed.
|
||||
if prev_last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
!= last_rec_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE)
|
||||
{
|
||||
info!("switched segment {} to {}", prev_last_rec_lsn, last_rec_lsn);
|
||||
let (oldest_segno, newest_segno) = find_wal_file_range(
|
||||
conf,
|
||||
&timelineid,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
last_rec_lsn,
|
||||
&tenantid,
|
||||
)?;
|
||||
|
||||
if newest_segno - oldest_segno >= 10 {
|
||||
// TODO: This is where we could remove WAL older than last_rec_lsn.
|
||||
//remove_wal_files(timelineid, pg_constants::WAL_SEGMENT_SIZE, last_rec_lsn)?;
|
||||
}
|
||||
}
|
||||
|
||||
if !caught_up && endlsn >= end_of_wal {
|
||||
info!("caught up at LSN {}", endlsn);
|
||||
caught_up = true;
|
||||
@@ -311,7 +278,7 @@ fn walreceiver_main(
|
||||
);
|
||||
|
||||
if reply_requested {
|
||||
Some(last_rec_lsn)
|
||||
Some(timeline.get_last_record_lsn())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
@@ -321,36 +288,59 @@ fn walreceiver_main(
|
||||
};
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(last_lsn));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let write_lsn = last_lsn;
|
||||
// This value doesn't guarantee data durability, but it's ok.
|
||||
// In setup with WAL service, pageserver durability is guaranteed by safekeepers.
|
||||
// In setup without WAL service, we just don't care.
|
||||
let flush_lsn = write_lsn;
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees persistence of all received data
|
||||
// Depending on the setup we recieve WAL directly from Compute Node or
|
||||
// from a WAL service.
|
||||
//
|
||||
// Senders use the feedback to determine if we are caught up:
|
||||
// - Safekeepers are free to remove WAL preceding `apply_lsn`,
|
||||
// as it will never be requested by this page server.
|
||||
// - Compute Node uses 'apply_lsn' to calculate a lag for back pressure mechanism
|
||||
// (delay WAL inserts to avoid lagging pageserver responses and WAL overflow).
|
||||
let apply_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0;
|
||||
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
if tenant_mgr::shutdown_requested() {
|
||||
debug!("stop walreceiver because pageserver shutdown is requested");
|
||||
break;
|
||||
fn find_wal_file_range(
|
||||
conf: &PageServerConf,
|
||||
timeline: &ZTimelineId,
|
||||
wal_seg_size: usize,
|
||||
written_upto: Lsn,
|
||||
tenant: &ZTenantId,
|
||||
) -> Result<(u64, u64)> {
|
||||
let written_upto_segno = written_upto.segment_number(wal_seg_size);
|
||||
|
||||
let mut oldest_segno = written_upto_segno;
|
||||
let mut newest_segno = written_upto_segno;
|
||||
// Scan the wal directory, and count how many WAL filed we could remove
|
||||
let wal_dir = conf.wal_dir_path(timeline, tenant);
|
||||
for entry in fs::read_dir(wal_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let filename = path.file_name().unwrap().to_str().unwrap();
|
||||
|
||||
if IsXLogFileName(filename) {
|
||||
let (segno, _tli) = XLogFromFileName(filename, wal_seg_size);
|
||||
|
||||
if segno > written_upto_segno {
|
||||
// that's strange.
|
||||
warn!("there is a WAL file from future at {}", path.display());
|
||||
continue;
|
||||
}
|
||||
|
||||
oldest_segno = min(oldest_segno, segno);
|
||||
newest_segno = max(newest_segno, segno);
|
||||
}
|
||||
}
|
||||
// FIXME: would be good to assert that there are no gaps in the WAL files
|
||||
|
||||
Ok(())
|
||||
Ok((oldest_segno, newest_segno))
|
||||
}
|
||||
|
||||
/// Data returned from the postgres `IDENTIFY_SYSTEM` command
|
||||
|
||||
@@ -22,24 +22,23 @@ use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use nix::poll::*;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::nonblock::set_nonblock;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::relish::*;
|
||||
@@ -54,8 +53,6 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
const N_WAL_REDO_PROCS: usize = 1;
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
///
|
||||
@@ -85,7 +82,7 @@ pub trait WalRedoManager: Send + Sync {
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError>;
|
||||
}
|
||||
|
||||
@@ -102,7 +99,7 @@ impl crate::walredo::WalRedoManager for DummyRedoManager {
|
||||
_blknum: u32,
|
||||
_lsn: Lsn,
|
||||
_base_img: Option<Bytes>,
|
||||
_records: Vec<(Lsn, WALRecord)>,
|
||||
_records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
Err(WalRedoError::InvalidState)
|
||||
}
|
||||
@@ -142,8 +139,8 @@ pub struct PostgresRedoManager {
|
||||
tenantid: ZTenantId,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
round_robin: AtomicUsize,
|
||||
processes: [Mutex<Option<PostgresRedoProcess>>; N_WAL_REDO_PROCS],
|
||||
runtime: tokio::runtime::Runtime,
|
||||
process: Mutex<Option<PostgresRedoProcess>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -153,16 +150,9 @@ struct WalRedoRequest {
|
||||
lsn: Lsn,
|
||||
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
records: Vec<WALRecord>,
|
||||
}
|
||||
|
||||
impl WalRedoRequest {
|
||||
// Can this request be served by zenith redo funcitons
|
||||
// or we need to pass it to wal-redo postgres process?
|
||||
fn can_apply_in_zenith(&self) -> bool {
|
||||
!matches!(self.rel, RelishTag::Relation(_))
|
||||
}
|
||||
}
|
||||
/// An error happened in WAL redo
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum WalRedoError {
|
||||
@@ -171,8 +161,6 @@ pub enum WalRedoError {
|
||||
|
||||
#[error("cannot perform WAL redo now")]
|
||||
InvalidState,
|
||||
#[error("cannot perform WAL redo for this request")]
|
||||
InvalidRequest,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -191,9 +179,10 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: Vec<(Lsn, WALRecord)>,
|
||||
records: Vec<WALRecord>,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let start_time;
|
||||
let lock_time;
|
||||
let end_time;
|
||||
|
||||
let request = WalRedoRequest {
|
||||
@@ -205,38 +194,26 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
};
|
||||
|
||||
start_time = Instant::now();
|
||||
let result;
|
||||
|
||||
if request.can_apply_in_zenith() {
|
||||
result = self.handle_apply_request_zenith(&request);
|
||||
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
|
||||
} else {
|
||||
let rr = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_WAL_REDO_PROCS;
|
||||
let mut process_guard = self.processes[rr].lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
let result = {
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenantid, rr)?;
|
||||
let p = self
|
||||
.runtime
|
||||
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
|
||||
result = self.handle_apply_request_postgres(process, &request);
|
||||
self.runtime
|
||||
.block_on(self.handle_apply_request(process, &request))
|
||||
};
|
||||
end_time = Instant::now();
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
|
||||
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
let process = process_guard.take().unwrap();
|
||||
process.kill();
|
||||
}
|
||||
}
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
|
||||
|
||||
result
|
||||
}
|
||||
@@ -247,57 +224,31 @@ impl PostgresRedoManager {
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
// The actual process is launched lazily, on first request.
|
||||
PostgresRedoManager {
|
||||
runtime,
|
||||
tenantid,
|
||||
conf,
|
||||
round_robin: AtomicUsize::new(0),
|
||||
processes: [(); N_WAL_REDO_PROCS].map(|_| Mutex::new(None)),
|
||||
process: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
/// Process one request for WAL redo.
|
||||
///
|
||||
fn handle_apply_request_postgres(
|
||||
async fn handle_apply_request(
|
||||
&self,
|
||||
process: &mut PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
) -> Result<Bytes, WalRedoError> {
|
||||
let blknum = request.blknum;
|
||||
let lsn = request.lsn;
|
||||
let base_img = request.base_img.clone();
|
||||
let records = &request.records;
|
||||
let nrecords = records.len();
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
|
||||
if let RelishTag::Relation(rel) = request.rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records);
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_micros(),
|
||||
lsn
|
||||
);
|
||||
|
||||
apply_result.map_err(WalRedoError::IoError)
|
||||
} else {
|
||||
Err(WalRedoError::InvalidRequest)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using custom zenith code
|
||||
///
|
||||
fn handle_apply_request_zenith(&self, request: &WalRedoRequest) -> Result<Bytes, WalRedoError> {
|
||||
let rel = request.rel;
|
||||
let blknum = request.blknum;
|
||||
let lsn = request.lsn;
|
||||
@@ -309,158 +260,176 @@ impl PostgresRedoManager {
|
||||
let start = Instant::now();
|
||||
|
||||
let apply_result: Result<Bytes, Error>;
|
||||
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
// If full-page image is provided, then use it...
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
if let RelishTag::Relation(rel) = rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
} else {
|
||||
// otherwise initialize page with zeros
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for (_lsn, record) in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
const ZERO_PAGE: [u8; 8192] = [0u8; 8192];
|
||||
let mut page = BytesMut::new();
|
||||
if let Some(fpi) = base_img {
|
||||
// If full-page image is provided, then use it...
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// otherwise initialize page with zeros
|
||||
page.extend_from_slice(&ZERO_PAGE);
|
||||
}
|
||||
// Apply all collected WAL records
|
||||
for record in records {
|
||||
let mut buf = record.rec.clone();
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
assert!(
|
||||
slru == SlruKind::Clog,
|
||||
"Not valid XACT relish tag {:?}",
|
||||
rel
|
||||
);
|
||||
segno
|
||||
}
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
};
|
||||
let parsed_xact =
|
||||
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
// 1. Parse XLogRecord struct
|
||||
// FIXME: refactor to avoid code duplication.
|
||||
let xlogrec = XLogRecord::from_bytes(&mut buf);
|
||||
|
||||
//move to main data
|
||||
// TODO probably, we should store some records in our special format
|
||||
// to avoid this weird parsing on replay
|
||||
let skip = (record.main_data_offset - pg_constants::SIZEOF_XLOGRECORD) as usize;
|
||||
if buf.remaining() > skip {
|
||||
buf.advance(skip);
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if let RelishTag::Slru {
|
||||
slru,
|
||||
segno: rec_segno,
|
||||
} = rel
|
||||
|
||||
if xlogrec.xl_rmid == pg_constants::RM_XACT_ID {
|
||||
// Transaction manager stuff
|
||||
let rec_segno = match rel {
|
||||
RelishTag::Slru { slru, segno } => {
|
||||
if slru != SlruKind::Clog {
|
||||
panic!("Not valid XACT relish tag {:?}", rel);
|
||||
}
|
||||
segno
|
||||
}
|
||||
_ => panic!("Not valid XACT relish tag {:?}", rel),
|
||||
};
|
||||
let parsed_xact =
|
||||
XlXactParsedRecord::decode(&mut buf, xlogrec.xl_xid, xlogrec.xl_info);
|
||||
if parsed_xact.info == pg_constants::XLOG_XACT_COMMIT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_COMMIT_PREPARED
|
||||
{
|
||||
if slru == SlruKind::MultiXactMembers {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let pageno = i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if segno == rec_segno && rpageno == blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &=
|
||||
!(((1 << pg_constants::MXACT_MEMBER_BITS_PER_XACT) - 1)
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_COMMITTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else if parsed_xact.info == pg_constants::XLOG_XACT_ABORT
|
||||
|| parsed_xact.info == pg_constants::XLOG_XACT_ABORT_PREPARED
|
||||
{
|
||||
transaction_id_set_status(
|
||||
parsed_xact.xid,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
for subxact in &parsed_xact.subxacts {
|
||||
let pageno = *subxact as u32 / pg_constants::CLOG_XACTS_PER_PAGE;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
// only update xids on the requested page
|
||||
if rec_segno == segno && blknum == rpageno {
|
||||
transaction_id_set_status(
|
||||
*subxact,
|
||||
pg_constants::TRANSACTION_STATUS_ABORTED,
|
||||
&mut page,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if xlogrec.xl_rmid == pg_constants::RM_MULTIXACT_ID {
|
||||
// Multixact operations
|
||||
let info = xlogrec.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(&mut buf);
|
||||
if let RelishTag::Slru {
|
||||
slru,
|
||||
segno: rec_segno,
|
||||
} = rel
|
||||
{
|
||||
if slru == SlruKind::MultiXactMembers {
|
||||
for i in 0..xlrec.nmembers {
|
||||
let pageno =
|
||||
i / pg_constants::MULTIXACT_MEMBERS_PER_PAGE as u32;
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
if segno == rec_segno && rpageno == blknum {
|
||||
// update only target block
|
||||
let offset = xlrec.moff + i;
|
||||
let memberoff = mx_offset_to_member_offset(offset);
|
||||
let flagsoff = mx_offset_to_flags_offset(offset);
|
||||
let bshift = mx_offset_to_flags_bitshift(offset);
|
||||
let mut flagsval =
|
||||
LittleEndian::read_u32(&page[flagsoff..flagsoff + 4]);
|
||||
flagsval &= !(((1
|
||||
<< pg_constants::MXACT_MEMBER_BITS_PER_XACT)
|
||||
- 1)
|
||||
<< bshift);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
flagsval |= xlrec.members[i as usize].status << bshift;
|
||||
LittleEndian::write_u32(
|
||||
&mut page[flagsoff..flagsoff + 4],
|
||||
flagsval,
|
||||
);
|
||||
LittleEndian::write_u32(
|
||||
&mut page[memberoff..memberoff + 4],
|
||||
xlrec.members[i as usize].xid,
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid
|
||||
% pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
}
|
||||
} else {
|
||||
// Multixact offsets SLRU
|
||||
let offs = (xlrec.mid % pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32
|
||||
* 4) as usize;
|
||||
LittleEndian::write_u32(&mut page[offs..offs + 4], xlrec.moff);
|
||||
panic!();
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
} else {
|
||||
panic!();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
apply_result = Ok::<Bytes, Error>(page.freeze());
|
||||
}
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
let result: Result<Bytes, WalRedoError>;
|
||||
|
||||
debug!(
|
||||
"zenith applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
"applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_millis(),
|
||||
lsn
|
||||
);
|
||||
|
||||
apply_result.map_err(WalRedoError::IoError)
|
||||
if let Err(e) = apply_result {
|
||||
error!("could not apply WAL records: {}", e);
|
||||
result = Err(WalRedoError::IoError(e));
|
||||
} else {
|
||||
let img = apply_result.unwrap();
|
||||
|
||||
result = Ok(img);
|
||||
}
|
||||
|
||||
// The caller is responsible for sending the response
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
@@ -468,33 +437,28 @@ impl PostgresRedoManager {
|
||||
/// Handle to the Postgres WAL redo process
|
||||
///
|
||||
struct PostgresRedoProcess {
|
||||
child: Child,
|
||||
stdin: ChildStdin,
|
||||
stdout: ChildStdout,
|
||||
stderr: ChildStderr,
|
||||
}
|
||||
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
fn launch(
|
||||
async fn launch(
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
id: usize,
|
||||
) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = conf
|
||||
.tenant_path(tenantid)
|
||||
.join(format! {"wal-redo-datadir-{}", id});
|
||||
let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
|
||||
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
info!("directory {:?} exists, removing", &datadir);
|
||||
if let Err(e) = fs::remove_dir_all(&datadir) {
|
||||
error!("could not remove old wal-redo-datadir: {:#}", e);
|
||||
error!("could not remove old wal-redo-datadir: {:?}", e);
|
||||
}
|
||||
}
|
||||
info!("running initdb in {:?}", datadir.display());
|
||||
@@ -505,6 +469,7 @@ impl PostgresRedoProcess {
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.output()
|
||||
.await
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.status.success() {
|
||||
@@ -541,139 +506,108 @@ impl PostgresRedoProcess {
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
let stderr = child.stderr.take().expect("failed to open child's stderr");
|
||||
let stdout = child.stdout.take().expect("failed to open child's stdout");
|
||||
|
||||
set_nonblock(stdin.as_raw_fd())?;
|
||||
set_nonblock(stdout.as_raw_fd())?;
|
||||
set_nonblock(stderr.as_raw_fd())?;
|
||||
// This async block reads the child's stderr, and forwards it to the logger
|
||||
let f_stderr = async {
|
||||
let mut stderr_buffered = tokio::io::BufReader::new(stderr);
|
||||
|
||||
Ok(PostgresRedoProcess {
|
||||
child,
|
||||
stdin,
|
||||
stdout,
|
||||
stderr,
|
||||
})
|
||||
}
|
||||
let mut line = String::new();
|
||||
loop {
|
||||
let res = stderr_buffered.read_line(&mut line).await;
|
||||
if res.is_err() {
|
||||
debug!("could not convert line to utf-8");
|
||||
continue;
|
||||
}
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
fn kill(mut self) {
|
||||
let _ = self.child.kill();
|
||||
if let Ok(exit_status) = self.child.wait() {
|
||||
error!("wal-redo-postgres exited with code {}", exit_status);
|
||||
}
|
||||
drop(self);
|
||||
Ok(PostgresRedoProcess { stdin, stdout })
|
||||
}
|
||||
|
||||
//
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
fn apply_wal_records(
|
||||
async fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, WALRecord)],
|
||||
records: &[WALRecord],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
let mut writebuf: Vec<u8> = Vec::new();
|
||||
build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, &img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
|
||||
}
|
||||
build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
// The input is now in 'writebuf'. Do a blind write first, writing as much as
|
||||
// we can, before calling poll(). That skips one call to poll() if the stdin is
|
||||
// already available for writing, which it almost certainly is because the
|
||||
// process is idle.
|
||||
let mut nwrite = self.stdin.write(&writebuf)?;
|
||||
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
|
||||
// Prepare for calling poll()
|
||||
let mut pollfds = [
|
||||
PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
|
||||
PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
|
||||
PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
|
||||
];
|
||||
let stdout = &mut self.stdout;
|
||||
// Buffer the writes to avoid a lot of small syscalls.
|
||||
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
while nresult < pg_constants::BLCKSZ.into() {
|
||||
// If we have more data to write, wake up if 'stdin' becomes writeable or
|
||||
// we have data to read. Otherwise only wake up if there's data to read.
|
||||
let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
|
||||
let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
|
||||
|
||||
if n == 0 {
|
||||
return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if base_img.is_some() {
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_push_page_msg(tag, base_img.unwrap())),
|
||||
)
|
||||
.await??;
|
||||
}
|
||||
|
||||
// If we have some messages in stderr, forward them to the log.
|
||||
let err_revents = pollfds[1].revents().unwrap();
|
||||
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
let mut errbuf: [u8; 16384] = [0; 16384];
|
||||
let n = self.stderr.read(&mut errbuf)?;
|
||||
// Send WAL records.
|
||||
for rec in records.iter() {
|
||||
let r = rec.clone();
|
||||
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
// good enough, the important thing is to get the message to the log.
|
||||
if n > 0 {
|
||||
error!(
|
||||
"wal-redo-postgres: {}",
|
||||
String::from_utf8_lossy(&errbuf[0..n])
|
||||
);
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
// To make sure we capture all log from the process if it fails, keep
|
||||
// reading from the stderr, before checking the stdout.
|
||||
continue;
|
||||
}
|
||||
} else if err_revents.contains(PollFlags::POLLHUP) {
|
||||
return Err(Error::new(
|
||||
ErrorKind::BrokenPipe,
|
||||
"WAL redo process closed its stderr unexpectedly",
|
||||
));
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(r.lsn, r.rec))
|
||||
.await?;
|
||||
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
// If we have more data to write and 'stdin' is writeable, do write.
|
||||
if nwrite < writebuf.len() {
|
||||
let in_revents = pollfds[2].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += self.stdin.write(&writebuf[nwrite..])?;
|
||||
} else if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
return Err(Error::new(
|
||||
ErrorKind::BrokenPipe,
|
||||
"WAL redo process closed its stdin unexpectedly",
|
||||
));
|
||||
}
|
||||
}
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += self.stdout.read(&mut resultbuf[nresult..])?;
|
||||
} else if out_revents.contains(PollFlags::POLLHUP) {
|
||||
return Err(Error::new(
|
||||
ErrorKind::BrokenPipe,
|
||||
"WAL redo process closed its stdout unexpectedly",
|
||||
));
|
||||
}
|
||||
}
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
Ok(Bytes::from(resultbuf))
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -681,42 +615,81 @@ impl PostgresRedoProcess {
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
|
||||
tag.ser_into(buf)
|
||||
// FIXME: this is a temporary hack that should go away when we refactor
|
||||
// the postgres protocol serialization + handlers.
|
||||
//
|
||||
// BytesMut is a dynamic growable buffer, used a lot in tokio code but
|
||||
// not in the std library. To write to a BytesMut from a serde serializer,
|
||||
// we need to either:
|
||||
// - pre-allocate the required buffer space. This is annoying because we
|
||||
// shouldn't care what the exact serialized size is-- that's the
|
||||
// serializer's job.
|
||||
// - Or, we need to create a temporary "writer" (which implements the
|
||||
// `Write` trait). It's a bit awkward, because the writer consumes the
|
||||
// underlying BytesMut, and we need to extract it later with
|
||||
// `into_inner`.
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: Bytes) -> Bytes {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
tag.ser_into(buf)
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let mut buf = writer.into_inner();
|
||||
buf.put(base_img);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: Bytes) -> Bytes {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
fn build_get_page_msg(tag: BufferTag) -> Bytes {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = BytesMut::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
tag.ser_into(buf)
|
||||
let mut writer = buf.writer();
|
||||
tag.ser_into(&mut writer)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
let buf = writer.into_inner();
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf.freeze()
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
use crate::pg_constants;
|
||||
use crate::CheckPoint;
|
||||
use crate::ControlFileData;
|
||||
use crate::FullTransactionId;
|
||||
use crate::XLogLongPageHeaderData;
|
||||
use crate::XLogPageHeaderData;
|
||||
@@ -17,8 +18,8 @@ use crate::XLOG_PAGE_MAGIC;
|
||||
|
||||
use anyhow::{bail, Result};
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::BytesMut;
|
||||
use bytes::{Buf, Bytes};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::max;
|
||||
@@ -43,9 +44,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
#[allow(clippy::identity_op)]
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
// PG timeline is always 1, changing it doesn't have useful meaning in Zenith.
|
||||
pub const PG_TLI: u32 = 1;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = i64;
|
||||
@@ -187,13 +185,8 @@ fn find_end_of_wal_segment(
|
||||
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
|
||||
if xl_tot_len == 0 {
|
||||
info!(
|
||||
"find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
|
||||
Lsn(XLogSegNoOffsetToRecPtr(
|
||||
segno,
|
||||
last_valid_rec_pos as u32,
|
||||
wal_seg_size
|
||||
))
|
||||
"find_end_of_wal_segment reached zeros at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size))
|
||||
);
|
||||
break; // zeros, reached the end
|
||||
}
|
||||
@@ -308,17 +301,12 @@ pub fn find_end_of_wal(
|
||||
high_segno,
|
||||
);
|
||||
}
|
||||
let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
|
||||
start_lsn.segment_offset(wal_seg_size)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
high_offs = find_end_of_wal_segment(
|
||||
data_dir,
|
||||
high_segno,
|
||||
high_tli,
|
||||
wal_seg_size,
|
||||
start_offset,
|
||||
start_lsn.segment_offset(wal_seg_size),
|
||||
)?;
|
||||
}
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
@@ -341,12 +329,7 @@ pub fn main() {
|
||||
}
|
||||
|
||||
impl XLogRecord {
|
||||
pub fn from_slice(buf: &[u8]) -> XLogRecord {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogRecord::des(buf).unwrap()
|
||||
}
|
||||
|
||||
pub fn from_bytes<B: Buf>(buf: &mut B) -> XLogRecord {
|
||||
pub fn from_bytes(buf: &mut Bytes) -> XLogRecord {
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
XLogRecord::des_from(&mut buf.reader()).unwrap()
|
||||
}
|
||||
@@ -394,12 +377,10 @@ impl CheckPoint {
|
||||
Ok(CheckPoint::des(buf)?)
|
||||
}
|
||||
|
||||
/// Update next XID based on provided new_xid and stored epoch.
|
||||
/// Next XID should be greater than new_xid. This handles 32-bit
|
||||
/// XID wraparound correctly.
|
||||
///
|
||||
/// Returns 'true' if the XID was updated.
|
||||
pub fn update_next_xid(&mut self, xid: u32) -> bool {
|
||||
// Update next XID based on provided new_xid and stored epoch.
|
||||
// Next XID should be greater than new_xid.
|
||||
// Also take in account 32-bit wrap-around.
|
||||
pub fn update_next_xid(&mut self, xid: u32) {
|
||||
let xid = xid.wrapping_add(XID_CHECKPOINT_INTERVAL - 1) & !(XID_CHECKPOINT_INTERVAL - 1);
|
||||
let full_xid = self.nextXid.value;
|
||||
let new_xid = std::cmp::max(xid + 1, pg_constants::FIRST_NORMAL_TRANSACTION_ID);
|
||||
@@ -410,37 +391,35 @@ impl CheckPoint {
|
||||
// wrap-around
|
||||
epoch += 1;
|
||||
}
|
||||
let nextXid = (epoch << 32) | new_xid as u64;
|
||||
|
||||
if nextXid != self.nextXid.value {
|
||||
self.nextXid = FullTransactionId { value: nextXid };
|
||||
return true;
|
||||
}
|
||||
self.nextXid = FullTransactionId {
|
||||
value: (epoch << 32) | new_xid as u64,
|
||||
};
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Generate new, empty WAL segment.
|
||||
// Generate new WAL segment with single XLOG_CHECKPOINT_SHUTDOWN record.
|
||||
// We need this segment to start compute node.
|
||||
// In order to minimize changes in Postgres core, we prefer to
|
||||
// provide WAL segment from which is can extract checkpoint record in standard way,
|
||||
// rather then implement some alternative mechanism.
|
||||
//
|
||||
pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
pub fn generate_wal_segment(pg_control: &ControlFileData) -> Bytes {
|
||||
let mut seg_buf = BytesMut::with_capacity(pg_constants::WAL_SEGMENT_SIZE as usize);
|
||||
|
||||
let pageaddr = XLogSegNoOffsetToRecPtr(segno, 0, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let hdr = XLogLongPageHeaderData {
|
||||
std: {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: PG_TLI,
|
||||
xlp_pageaddr: pageaddr,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_pageaddr: pg_control.checkPoint - XLOG_SIZE_OF_XLOG_LONG_PHD as u64,
|
||||
xlp_rem_len: 0,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
}
|
||||
},
|
||||
xlp_sysid: system_id,
|
||||
xlp_sysid: pg_control.system_identifier,
|
||||
xlp_seg_size: pg_constants::WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
};
|
||||
@@ -448,6 +427,36 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
let hdr_bytes = hdr.encode();
|
||||
seg_buf.extend_from_slice(&hdr_bytes);
|
||||
|
||||
let rec_hdr = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD
|
||||
+ SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT
|
||||
+ SIZEOF_CHECKPOINT) as u32,
|
||||
xl_xid: 0, //0 is for InvalidTransactionId
|
||||
xl_prev: 0,
|
||||
xl_info: pg_constants::XLOG_CHECKPOINT_SHUTDOWN,
|
||||
xl_rmid: pg_constants::RM_XLOG_ID,
|
||||
xl_crc: 0,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
};
|
||||
|
||||
let mut rec_shord_hdr_bytes = BytesMut::new();
|
||||
rec_shord_hdr_bytes.put_u8(pg_constants::XLR_BLOCK_ID_DATA_SHORT);
|
||||
rec_shord_hdr_bytes.put_u8(SIZEOF_CHECKPOINT as u8);
|
||||
|
||||
let rec_bytes = rec_hdr.encode();
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
|
||||
//calculate record checksum
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &rec_shord_hdr_bytes[..]);
|
||||
crc = crc32c_append(crc, &checkpoint_bytes[..]);
|
||||
crc = crc32c_append(crc, &rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
|
||||
seg_buf.extend_from_slice(&rec_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
seg_buf.put_u32_le(crc);
|
||||
seg_buf.extend_from_slice(&rec_shord_hdr_bytes);
|
||||
seg_buf.extend_from_slice(&checkpoint_bytes);
|
||||
|
||||
//zero out the rest of the file
|
||||
seg_buf.resize(pg_constants::WAL_SEGMENT_SIZE, 0);
|
||||
seg_buf.freeze()
|
||||
|
||||
@@ -37,27 +37,20 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
|
||||
return cmd
|
||||
|
||||
|
||||
def yapf(fix_inplace: bool) -> str:
|
||||
cmd = "pipenv run yapf --recursive"
|
||||
if fix_inplace:
|
||||
cmd += " --in-place"
|
||||
else:
|
||||
cmd += " --diff"
|
||||
return cmd
|
||||
|
||||
|
||||
def mypy() -> str:
|
||||
return "pipenv run mypy"
|
||||
|
||||
|
||||
def get_commit_files() -> List[str]:
|
||||
files = subprocess.check_output("git diff --cached --name-only --diff-filter=ACM".split())
|
||||
files = subprocess.check_output(
|
||||
"git diff --cached --name-only --diff-filter=ACM".split()
|
||||
)
|
||||
return files.decode().splitlines()
|
||||
|
||||
|
||||
def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False):
|
||||
def check(
|
||||
name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False
|
||||
):
|
||||
print(f"Checking: {name} ", end="")
|
||||
applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files))
|
||||
applicable_files = list(
|
||||
filter(lambda fname: fname.strip().endswith(suffix), changed_files)
|
||||
)
|
||||
if not applicable_files:
|
||||
print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color))
|
||||
return
|
||||
@@ -66,14 +59,7 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
|
||||
res = subprocess.run(cmd.split(), capture_output=True)
|
||||
if res.returncode != 0:
|
||||
print(colorify("[FAILED]", Color.RED, no_color))
|
||||
if name == "mypy":
|
||||
print("Please inspect the output below and fix type mismatches.")
|
||||
else:
|
||||
print("Please inspect the output below and run make fmt to fix automatically.")
|
||||
if suffix == ".py":
|
||||
print("If the output is empty, ensure that you've installed Python tooling by\n"
|
||||
"running 'pipenv install --dev' in the current directory (no root needed)")
|
||||
print()
|
||||
print("Please inspect the output below and run make fmt to fix automatically\n")
|
||||
print(res.stdout.decode())
|
||||
exit(1)
|
||||
|
||||
@@ -82,11 +68,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
|
||||
parser.add_argument("--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty())
|
||||
parser.add_argument(
|
||||
"--fix-inplace", action="store_true", help="apply fixes inplace"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-color", action="store_true", help="disable colored output", default=not sys.stdout.isatty()
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
files = get_commit_files()
|
||||
@@ -100,17 +87,3 @@ if __name__ == "__main__":
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="yapf",
|
||||
suffix=".py",
|
||||
cmd=yapf(fix_inplace=args.fix_inplace),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="mypy",
|
||||
suffix=".py",
|
||||
cmd=mypy(),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
|
||||
@@ -1,139 +1,60 @@
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use anyhow::{bail, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::{SocketAddr, ToSocketAddrs};
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
|
||||
use crate::state::ProxyWaiters;
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
pub password: Option<String>,
|
||||
pub struct CPlaneApi {
|
||||
auth_endpoint: &'static str,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(untagged)]
|
||||
enum ProxyAuthResponse {
|
||||
Ready { conn_info: DatabaseInfo },
|
||||
Error { error: String },
|
||||
NotReady { ready: bool }, // TODO: get rid of `ready`
|
||||
pub struct DatabaseInfo {
|
||||
pub host: IpAddr, // TODO: allow host name here too
|
||||
pub port: u16,
|
||||
pub dbname: String,
|
||||
pub user: String,
|
||||
pub password: String,
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
host_port
|
||||
.to_socket_addrs()
|
||||
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("cannot resolve at least one SocketAddr"))
|
||||
pub fn socket_addr(&self) -> SocketAddr {
|
||||
SocketAddr::new(self.host, self.port)
|
||||
}
|
||||
|
||||
pub fn conn_string(&self) -> String {
|
||||
format!(
|
||||
"dbname={} user={} password={}",
|
||||
self.dbname, self.user, self.password
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DatabaseInfo> for tokio_postgres::Config {
|
||||
fn from(db_info: DatabaseInfo) -> Self {
|
||||
let mut config = tokio_postgres::Config::new();
|
||||
|
||||
config
|
||||
.host(&db_info.host)
|
||||
.port(db_info.port)
|
||||
.dbname(&db_info.dbname)
|
||||
.user(&db_info.user);
|
||||
|
||||
if let Some(password) = db_info.password {
|
||||
config.password(password);
|
||||
}
|
||||
|
||||
config
|
||||
impl CPlaneApi {
|
||||
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
|
||||
CPlaneApi { auth_endpoint }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CPlaneApi<'a> {
|
||||
auth_endpoint: &'a str,
|
||||
waiters: &'a ProxyWaiters,
|
||||
}
|
||||
|
||||
impl<'a> CPlaneApi<'a> {
|
||||
pub fn new(auth_endpoint: &'a str, waiters: &'a ProxyWaiters) -> Self {
|
||||
Self {
|
||||
auth_endpoint,
|
||||
waiters,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CPlaneApi<'_> {
|
||||
pub fn authenticate_proxy_request(
|
||||
&self,
|
||||
user: &str,
|
||||
database: &str,
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
psql_session_id: &str,
|
||||
) -> anyhow::Result<DatabaseInfo> {
|
||||
) -> Result<DatabaseInfo> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
.append_pair("database", database)
|
||||
.append_pair("md5response", std::str::from_utf8(md5_response)?)
|
||||
.append_pair("salt", &hex::encode(salt))
|
||||
.append_pair("psql_session_id", psql_session_id);
|
||||
.append_pair("salt", &hex::encode(salt));
|
||||
|
||||
let waiter = self.waiters.register(psql_session_id.to_owned());
|
||||
println!("cplane request: {}", url.as_str());
|
||||
|
||||
println!("cplane request: {}", url);
|
||||
let resp = reqwest::blocking::get(url)?;
|
||||
if !resp.status().is_success() {
|
||||
bail!("Auth failed: {}", resp.status())
|
||||
}
|
||||
|
||||
let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got auth info: #{:?}", auth_info);
|
||||
|
||||
use ProxyAuthResponse::*;
|
||||
match auth_info {
|
||||
Ready { conn_info } => Ok(conn_info),
|
||||
Error { error } => bail!(error),
|
||||
NotReady { .. } => waiter.wait()?.map_err(|e| anyhow!(e)),
|
||||
if resp.status().is_success() {
|
||||
let conn_info: DatabaseInfo = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got conn info: #{:?}", conn_info);
|
||||
Ok(conn_info)
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn test_proxy_auth_response() {
|
||||
// Ready
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": true,
|
||||
"conn_info": DatabaseInfo::default(),
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(
|
||||
auth,
|
||||
ProxyAuthResponse::Ready {
|
||||
conn_info: DatabaseInfo { .. }
|
||||
}
|
||||
));
|
||||
|
||||
// Error
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": false,
|
||||
"error": "too bad, so sad",
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(auth, ProxyAuthResponse::Error { .. }));
|
||||
|
||||
// NotReady
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": false,
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,21 +5,78 @@
|
||||
/// (control plane API in our case) and can create new databases and accounts
|
||||
/// in somewhat transparent manner (again via communication with control plane API).
|
||||
///
|
||||
use anyhow::bail;
|
||||
use clap::{App, Arg};
|
||||
use state::{ProxyConfig, ProxyState};
|
||||
use std::thread;
|
||||
use zenith_utils::{tcp_listener, GIT_VERSION};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{SocketAddr, TcpListener},
|
||||
sync::{mpsc, Arc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
|
||||
use cplane_api::DatabaseInfo;
|
||||
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
|
||||
|
||||
mod cplane_api;
|
||||
mod mgmt;
|
||||
mod proxy;
|
||||
mod state;
|
||||
mod waiters;
|
||||
|
||||
pub struct ProxyConf {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// http management endpoint. Upon user account creation control plane
|
||||
/// will notify us here, so that we can 'unfreeze' user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<Arc<ServerConfig>>,
|
||||
}
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConf,
|
||||
pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
|
||||
}
|
||||
|
||||
fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerConfig>>> {
|
||||
let (key_path, cert_path) = match (
|
||||
arg_matches.value_of("ssl-key"),
|
||||
arg_matches.value_of("ssl-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => (key_path, cert_path),
|
||||
(None, None) => return Ok(None),
|
||||
_ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
|
||||
};
|
||||
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
};
|
||||
|
||||
let cert_chain = {
|
||||
let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
|
||||
pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS certificates"))?
|
||||
};
|
||||
|
||||
let mut config = ServerConfig::new(NoClientAuth::new());
|
||||
config.set_single_cert(cert_chain, key)?;
|
||||
config.versions = vec![ProtocolVersion::TLSv1_3];
|
||||
|
||||
Ok(Some(Arc::new(config)))
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Zenith proxy/router")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("proxy")
|
||||
.short("p")
|
||||
@@ -68,47 +125,38 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let ssl_config = match (
|
||||
arg_matches.value_of("ssl-key"),
|
||||
arg_matches.value_of("ssl-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => {
|
||||
Some(crate::state::configure_ssl(key_path, cert_path)?)
|
||||
}
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
|
||||
};
|
||||
|
||||
let config = ProxyConfig {
|
||||
let conf = ProxyConf {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
|
||||
ssl_config,
|
||||
ssl_config: configure_ssl(&arg_matches)?,
|
||||
};
|
||||
let state: &ProxyState = Box::leak(Box::new(ProxyState::new(config)));
|
||||
|
||||
println!("Version: {}", GIT_VERSION);
|
||||
let state = ProxyState {
|
||||
conf,
|
||||
waiters: Mutex::new(HashMap::new()),
|
||||
};
|
||||
let state: &'static ProxyState = Box::leak(Box::new(state));
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting proxy on {}", state.conf.proxy_address);
|
||||
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
|
||||
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
|
||||
|
||||
println!("Starting mgmt on {}", state.conf.mgmt_address);
|
||||
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
|
||||
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
|
||||
|
||||
let threads = [
|
||||
let threads = vec![
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
thread::Builder::new()
|
||||
.name("Listener thread".into())
|
||||
.name("Proxy thread".into())
|
||||
.spawn(move || proxy::thread_main(state, pageserver_listener))?,
|
||||
thread::Builder::new()
|
||||
.name("Mgmt thread".into())
|
||||
.spawn(move || mgmt::thread_main(state, mgmt_listener))?,
|
||||
];
|
||||
|
||||
for t in threads {
|
||||
for t in threads.into_iter() {
|
||||
t.join().unwrap()?;
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::{
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use serde::Deserialize;
|
||||
use zenith_utils::{
|
||||
@@ -24,23 +25,22 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = handle_connection(state, socket) {
|
||||
if let Err(err) = mgmt_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_connection(state: &ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
struct MgmtHandler<'a> {
|
||||
state: &'a ProxyState,
|
||||
struct MgmtHandler {
|
||||
state: &'static ProxyState,
|
||||
}
|
||||
|
||||
/// Serialized examples:
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
@@ -64,18 +64,18 @@ struct MgmtHandler<'a> {
|
||||
// // to test manually by sending a query to mgmt interface:
|
||||
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
|
||||
#[derive(Deserialize)]
|
||||
struct PsqlSessionResponse {
|
||||
pub struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
enum PsqlSessionResult {
|
||||
pub enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
}
|
||||
|
||||
impl postgres_backend::Handler for MgmtHandler<'_> {
|
||||
impl postgres_backend::Handler for MgmtHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
@@ -96,26 +96,32 @@ fn try_process_query(
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
use PsqlSessionResult::*;
|
||||
let msg = match resp.result {
|
||||
Success(db_info) => Ok(db_info),
|
||||
Failure(message) => Err(message),
|
||||
};
|
||||
let waiters = mgmt.state.waiters.lock().unwrap();
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
|
||||
match mgmt.state.waiters.notify(&resp.session_id, msg) {
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
pgb.write_message(&BeMessage::ErrorResponse(e.to_string()))?;
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,12 +1,18 @@
|
||||
use crate::cplane_api::{CPlaneApi, DatabaseInfo};
|
||||
use crate::cplane_api::CPlaneApi;
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use crate::ProxyState;
|
||||
use anyhow::{anyhow, bail};
|
||||
use std::net::TcpStream;
|
||||
use std::{io, thread};
|
||||
|
||||
use anyhow::bail;
|
||||
use tokio_postgres::NoTls;
|
||||
use zenith_utils::postgres_backend::{self, PostgresBackend, ProtoState, Stream};
|
||||
use zenith_utils::pq_proto::{BeMessage as Be, FeMessage as Fe, *};
|
||||
|
||||
use rand::Rng;
|
||||
use std::io::Write;
|
||||
use std::{io, sync::mpsc::channel, thread};
|
||||
use zenith_utils::postgres_backend::Stream;
|
||||
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
|
||||
use zenith_utils::pq_proto::*;
|
||||
use zenith_utils::sock_split::{ReadStream, WriteStream};
|
||||
use zenith_utils::{postgres_backend, pq_proto::BeMessage};
|
||||
|
||||
///
|
||||
/// Main proxy listener loop.
|
||||
@@ -22,259 +28,271 @@ pub fn thread_main(
|
||||
println!("accepted connection from {}", peer_addr);
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::Builder::new()
|
||||
.name("Proxy thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = proxy_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
})?;
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = proxy_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: clean up fields
|
||||
// XXX: clean up fields
|
||||
struct ProxyConnection {
|
||||
state: &'static ProxyState,
|
||||
psql_session_id: String,
|
||||
|
||||
cplane: CPlaneApi,
|
||||
|
||||
user: String,
|
||||
database: String,
|
||||
|
||||
pgb: PostgresBackend,
|
||||
md5_salt: [u8; 4],
|
||||
|
||||
psql_session_id: String,
|
||||
}
|
||||
|
||||
pub fn proxy_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let conn = ProxyConnection {
|
||||
pub fn proxy_conn_main(
|
||||
state: &'static ProxyState,
|
||||
socket: std::net::TcpStream,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
state,
|
||||
psql_session_id: hex::encode(rand::random::<[u8; 8]>()),
|
||||
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
pgb: PostgresBackend::new(
|
||||
socket,
|
||||
postgres_backend::AuthType::MD5,
|
||||
state.conf.ssl_config.clone(),
|
||||
false,
|
||||
)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
};
|
||||
|
||||
let (client, server) = conn.handle_client()?;
|
||||
// Check StartupMessage
|
||||
// This will set conn.existing_user and we can decide on next actions
|
||||
conn.handle_startup()?;
|
||||
|
||||
let server = zenith_utils::sock_split::BidiStream::from_tcp(server);
|
||||
|
||||
let client = match client {
|
||||
Stream::Bidirectional(bidi_stream) => bidi_stream,
|
||||
_ => panic!("invalid stream type"),
|
||||
// both scenarious here should end up producing database connection string
|
||||
let db_info = if conn.is_existing_user() {
|
||||
conn.handle_existing_user()?
|
||||
} else {
|
||||
conn.handle_new_user()?
|
||||
};
|
||||
|
||||
proxy(client.split(), server.split())
|
||||
// XXX: move that inside handle_new_user/handle_existing_user to be able to
|
||||
// report wrong connection error.
|
||||
proxy_pass(conn.pgb, db_info)
|
||||
}
|
||||
|
||||
impl ProxyConnection {
|
||||
fn handle_client(mut self) -> anyhow::Result<(Stream, TcpStream)> {
|
||||
let mut authenticate = || {
|
||||
let (username, dbname) = self.handle_startup()?;
|
||||
|
||||
// Both scenarios here should end up producing database credentials
|
||||
if username.ends_with("@zenith") {
|
||||
self.handle_existing_user(&username, &dbname)
|
||||
} else {
|
||||
self.handle_new_user()
|
||||
}
|
||||
};
|
||||
|
||||
let conn = match authenticate() {
|
||||
Ok(db_info) => connect_to_db(db_info),
|
||||
Err(e) => {
|
||||
// Report the error to the client
|
||||
self.pgb.write_message(&Be::ErrorResponse(e.to_string()))?;
|
||||
bail!("failed to handle client: {:?}", e);
|
||||
}
|
||||
};
|
||||
|
||||
// We'll get rid of this once migration to async is complete
|
||||
let (pg_version, db_stream) = {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let (pg_version, stream) = runtime.block_on(conn)?;
|
||||
let stream = stream.into_std()?;
|
||||
stream.set_nonblocking(false)?;
|
||||
|
||||
(pg_version, stream)
|
||||
};
|
||||
|
||||
// Let the client send new requests
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus(
|
||||
BeParameterStatusMessage::ServerVersion(&pg_version),
|
||||
))?
|
||||
.write_message(&Be::ReadyForQuery)?;
|
||||
|
||||
Ok((self.pgb.into_stream(), db_stream))
|
||||
fn is_existing_user(&self) -> bool {
|
||||
self.user.ends_with("@zenith")
|
||||
}
|
||||
|
||||
fn handle_startup(&mut self) -> anyhow::Result<(String, String)> {
|
||||
let have_tls = self.pgb.tls_config.is_some();
|
||||
fn handle_startup(&mut self) -> anyhow::Result<()> {
|
||||
let mut encrypted = false;
|
||||
|
||||
loop {
|
||||
let mut msg = match self.pgb.read_message()? {
|
||||
Some(Fe::StartupMessage(msg)) => msg,
|
||||
None => bail!("connection is lost"),
|
||||
bad => bail!("unexpected message type: {:?}", bad),
|
||||
};
|
||||
println!("got message: {:?}", msg);
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
match msg {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
println!("got startup message {:?}", m);
|
||||
|
||||
match msg.kind {
|
||||
StartupRequestCode::NegotiateGss => {
|
||||
self.pgb.write_message(&Be::EncryptionResponse(false))?;
|
||||
}
|
||||
StartupRequestCode::NegotiateSsl => {
|
||||
self.pgb.write_message(&Be::EncryptionResponse(have_tls))?;
|
||||
if have_tls {
|
||||
self.pgb.start_tls()?;
|
||||
encrypted = true;
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
StartupRequestCode::NegotiateSsl => {
|
||||
println!("SSL requested");
|
||||
if self.pgb.tls_config.is_some() {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(true))?;
|
||||
self.pgb.start_tls()?;
|
||||
encrypted = true;
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
if self.state.conf.ssl_config.is_some() && !encrypted {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS".to_string(),
|
||||
))?;
|
||||
bail!("client did not connect with TLS");
|
||||
}
|
||||
self.user = m
|
||||
.params
|
||||
.get("user")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("user is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
self.database = m
|
||||
.params
|
||||
.get("database")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("database is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
|
||||
break;
|
||||
}
|
||||
StartupRequestCode::Cancel => break,
|
||||
}
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
if have_tls && !encrypted {
|
||||
bail!("must connect with TLS");
|
||||
}
|
||||
|
||||
let mut get_param = |key| {
|
||||
msg.params
|
||||
.remove(key)
|
||||
.ok_or_else(|| anyhow!("{} is missing in startup packet", key))
|
||||
};
|
||||
|
||||
return Ok((get_param("user")?, get_param("database")?));
|
||||
None => {
|
||||
bail!("connection closed")
|
||||
}
|
||||
unexpected => {
|
||||
bail!("unexpected message type : {:?}", unexpected)
|
||||
}
|
||||
// TODO: implement proper stmt cancellation
|
||||
StartupRequestCode::Cancel => bail!("query cancellation is not supported"),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
// ask password
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
self.pgb
|
||||
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// check password
|
||||
println!("handle_existing_user");
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
if let Some(FeMessage::PasswordMessage(m)) = msg {
|
||||
println!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.is_existing_user());
|
||||
|
||||
let (_trailing_null, md5_response) = m
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
|
||||
match self.cplane.authenticate_proxy_request(
|
||||
self.user.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
) {
|
||||
Err(e) => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::ErrorResponse(format!("{}", e)))?;
|
||||
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
Ok(conn_info) => {
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
bail!("protocol violation");
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self, user: &str, db: &str) -> anyhow::Result<DatabaseInfo> {
|
||||
let md5_salt = rand::random::<[u8; 4]>();
|
||||
|
||||
// Ask password
|
||||
self.pgb
|
||||
.write_message(&Be::AuthenticationMD5Password(&md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// Check password
|
||||
let msg = match self.pgb.read_message()? {
|
||||
Some(Fe::PasswordMessage(msg)) => msg,
|
||||
None => bail!("connection is lost"),
|
||||
bad => bail!("unexpected message type: {:?}", bad),
|
||||
};
|
||||
println!("got message: {:?}", msg);
|
||||
|
||||
let (_trailing_null, md5_response) = msg
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow!("unexpected password message"))?;
|
||||
|
||||
let cplane = CPlaneApi::new(&self.state.conf.auth_endpoint, &self.state.waiters);
|
||||
let db_info = cplane.authenticate_proxy_request(
|
||||
user,
|
||||
db,
|
||||
md5_response,
|
||||
&md5_salt,
|
||||
&self.psql_session_id,
|
||||
)?;
|
||||
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
|
||||
|
||||
Ok(db_info)
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let greeting = hello_message(&self.state.conf.redirect_uri, &self.psql_session_id);
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
self.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
|
||||
// First, register this session
|
||||
let waiter = self.state.waiters.register(self.psql_session_id.clone());
|
||||
let hello_message = format!("☀️ Welcome to Zenith!
|
||||
|
||||
To proceed with database creation, open the following link:
|
||||
|
||||
{redirect_uri}{sess_id}
|
||||
|
||||
It needs to be done once and we will send you '.pgpass' file, which will allow you to access or create
|
||||
databases without opening the browser.
|
||||
|
||||
", redirect_uri = self.state.conf.redirect_uri, sess_id = self.psql_session_id);
|
||||
|
||||
// Give user a URL to spawn a new database
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?
|
||||
.write_message(&Be::NoticeResponse(greeting))?;
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb
|
||||
.write_message(&BeMessage::NoticeResponse(hello_message))?;
|
||||
|
||||
// await for database creation
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
|
||||
// Wait for web console response
|
||||
let db_info = waiter.wait()?.map_err(|e| anyhow!(e))?;
|
||||
// XXX: respond with error to client
|
||||
let dbinfo = rx.recv()??;
|
||||
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
|
||||
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
|
||||
"Connecting to database.".to_string(),
|
||||
))?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(db_info)
|
||||
Ok(dbinfo)
|
||||
}
|
||||
}
|
||||
|
||||
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
|
||||
format!(
|
||||
concat![
|
||||
"☀️ Welcome to Zenith!\n",
|
||||
"To proceed with database creation, open the following link:\n\n",
|
||||
" {redirect_uri}{session_id}\n\n",
|
||||
"It needs to be done once and we will send you '.pgpass' file,\n",
|
||||
"which will allow you to access or create ",
|
||||
"databases without opening your web browser."
|
||||
],
|
||||
redirect_uri = redirect_uri,
|
||||
session_id = session_id,
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<(String, tokio::net::TcpStream)> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
|
||||
let config = tokio_postgres::Config::from(db_info);
|
||||
let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
|
||||
|
||||
let query = client.query_one("select current_setting('server_version')", &[]);
|
||||
|
||||
tokio::pin!(query, conn);
|
||||
|
||||
let version = tokio::select!(
|
||||
x = query => x?.try_get(0)?,
|
||||
_ = conn => bail!("connection closed too early"),
|
||||
);
|
||||
|
||||
Ok((version, socket))
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()).await?;
|
||||
let config = db_info.conn_string().parse::<tokio_postgres::Config>()?;
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
Ok(socket)
|
||||
}
|
||||
|
||||
/// Concurrently proxy both directions of the client and server connections
|
||||
fn proxy(
|
||||
(client_read, client_write): (ReadStream, WriteStream),
|
||||
(server_read, server_write): (ReadStream, WriteStream),
|
||||
client_read: ReadStream,
|
||||
client_write: WriteStream,
|
||||
server_read: ReadStream,
|
||||
server_write: WriteStream,
|
||||
) -> anyhow::Result<()> {
|
||||
fn do_proxy(mut reader: impl io::Read, mut writer: WriteStream) -> io::Result<u64> {
|
||||
/// FlushWriter will make sure that every message is sent as soon as possible
|
||||
struct FlushWriter<W>(W);
|
||||
|
||||
impl<W: io::Write> io::Write for FlushWriter<W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
// `std::io::copy` is guaranteed to exit if we return an error,
|
||||
// so we can afford to lose `res` in case `flush` fails
|
||||
let res = self.0.write(buf);
|
||||
if res.is_ok() {
|
||||
self.flush()?;
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.0.flush()
|
||||
}
|
||||
}
|
||||
|
||||
let res = std::io::copy(&mut reader, &mut FlushWriter(&mut writer));
|
||||
writer.shutdown(std::net::Shutdown::Both)?;
|
||||
res
|
||||
fn do_proxy(mut reader: ReadStream, mut writer: WriteStream) -> io::Result<()> {
|
||||
std::io::copy(&mut reader, &mut writer)?;
|
||||
writer.flush()?;
|
||||
writer.shutdown(std::net::Shutdown::Both)
|
||||
}
|
||||
|
||||
let client_to_server_jh = thread::spawn(move || do_proxy(client_read, server_write));
|
||||
|
||||
do_proxy(server_read, client_write)?;
|
||||
client_to_server_jh.join().unwrap()?;
|
||||
let res1 = do_proxy(server_read, client_write);
|
||||
let res2 = client_to_server_jh.join().unwrap();
|
||||
res1?;
|
||||
res2?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Proxy a client connection to a postgres database
|
||||
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
let db_stream = runtime.block_on(connect_to_db(db_info))?;
|
||||
let db_stream = db_stream.into_std()?;
|
||||
db_stream.set_nonblocking(false)?;
|
||||
|
||||
let db_stream = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
|
||||
let (db_read, db_write) = db_stream.split();
|
||||
|
||||
let stream = match pgb.into_stream() {
|
||||
Stream::Bidirectional(bidi_stream) => bidi_stream,
|
||||
_ => bail!("invalid stream"),
|
||||
};
|
||||
|
||||
let (client_read, client_write) = stream.split();
|
||||
proxy(client_read, client_write, db_read, db_write)
|
||||
}
|
||||
|
||||
@@ -1,62 +0,0 @@
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub type SslConfig = Arc<ServerConfig>;
|
||||
|
||||
pub struct ProxyConfig {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// http management endpoint. Upon user account creation control plane
|
||||
/// will notify us here, so that we can 'unfreeze' user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<SslConfig>,
|
||||
}
|
||||
|
||||
pub type ProxyWaiters = crate::waiters::Waiters<Result<DatabaseInfo, String>>;
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConfig,
|
||||
pub waiters: ProxyWaiters,
|
||||
}
|
||||
|
||||
impl ProxyState {
|
||||
pub fn new(conf: ProxyConfig) -> Self {
|
||||
Self {
|
||||
conf,
|
||||
waiters: ProxyWaiters::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result<SslConfig> {
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
};
|
||||
|
||||
let cert_chain = {
|
||||
let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
|
||||
pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS certificates"))?
|
||||
};
|
||||
|
||||
let mut config = ServerConfig::new(NoClientAuth::new());
|
||||
config.set_single_cert(cert_chain, key)?;
|
||||
config.versions = vec![ProtocolVersion::TLSv1_3];
|
||||
|
||||
Ok(config.into())
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{mpsc, Mutex};
|
||||
|
||||
pub struct Waiters<T>(pub(self) Mutex<HashMap<String, mpsc::Sender<T>>>);
|
||||
|
||||
impl<T> Default for Waiters<T> {
|
||||
fn default() -> Self {
|
||||
Waiters(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Waiters<T> {
|
||||
pub fn register(&self, key: String) -> Waiter<T> {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
|
||||
// TODO: use `try_insert` (unstable)
|
||||
let prev = self.0.lock().unwrap().insert(key.clone(), tx);
|
||||
assert!(matches!(prev, None)); // assert_matches! is nightly-only
|
||||
|
||||
Waiter {
|
||||
receiver: rx,
|
||||
registry: self,
|
||||
key,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()>
|
||||
where
|
||||
T: Send + Sync + 'static,
|
||||
{
|
||||
let tx = self
|
||||
.0
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(key)
|
||||
.ok_or_else(|| anyhow!("key {} not found", key))?;
|
||||
tx.send(value).context("channel hangup")
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Waiter<'a, T> {
|
||||
receiver: mpsc::Receiver<T>,
|
||||
registry: &'a Waiters<T>,
|
||||
key: String,
|
||||
}
|
||||
|
||||
impl<T> Waiter<'_, T> {
|
||||
pub fn wait(self) -> anyhow::Result<T> {
|
||||
self.receiver.recv().context("channel hangup")
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for Waiter<'_, T> {
|
||||
fn drop(&mut self) {
|
||||
self.registry.0.lock().unwrap().remove(&self.key);
|
||||
}
|
||||
}
|
||||
@@ -1,9 +0,0 @@
|
||||
[pytest]
|
||||
addopts =
|
||||
-m 'not remote_cluster'
|
||||
markers =
|
||||
remote_cluster
|
||||
minversion = 6.0
|
||||
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
|
||||
log_date_format = %Y-%m-%d %H:%M:%S
|
||||
log_cli = true
|
||||
510
scripts/coverage
510
scripts/coverage
@@ -1,510 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Here'a good link in case you're interested in learning more
|
||||
# about current deficiencies of rust code coverage story:
|
||||
# https://github.com/rust-lang/rust/issues?q=is%3Aissue+is%3Aopen+instrument-coverage+label%3AA-code-coverage
|
||||
#
|
||||
# Also a couple of inspirational tools which I deliberately ended up not using:
|
||||
# * https://github.com/mozilla/grcov
|
||||
# * https://github.com/taiki-e/cargo-llvm-cov
|
||||
# * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from textwrap import dedent
|
||||
from typing import Any, Iterable, List, Optional
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def intersperse(sep: Any, iterable: Iterable[Any]):
|
||||
fst = True
|
||||
for item in iterable:
|
||||
if not fst:
|
||||
yield sep
|
||||
fst = False
|
||||
yield item
|
||||
|
||||
|
||||
def find_demangler(demangler=None):
|
||||
known_tools = ['c++filt', 'rustfilt', 'llvm-cxxfilt']
|
||||
|
||||
if demangler:
|
||||
# Explicit argument has precedence over `known_tools`
|
||||
demanglers = [demangler]
|
||||
else:
|
||||
demanglers = known_tools
|
||||
|
||||
for demangler in demanglers:
|
||||
if shutil.which(demangler):
|
||||
return demangler
|
||||
|
||||
raise Exception(' '.join([
|
||||
'Failed to find symbol demangler.',
|
||||
'Please install it or provide another tool',
|
||||
f"(e.g. {', '.join(known_tools)})",
|
||||
]))
|
||||
|
||||
|
||||
class Cargo:
|
||||
def __init__(self, cwd: Path):
|
||||
self.cwd = cwd
|
||||
self.target_dir = Path(os.environ.get('CARGO_TARGET_DIR', cwd / 'target')).resolve()
|
||||
self._rustlib_dir = None
|
||||
|
||||
@property
|
||||
def rustlib_dir(self):
|
||||
if not self._rustlib_dir:
|
||||
cmd = [
|
||||
'cargo',
|
||||
'-Zunstable-options',
|
||||
'rustc',
|
||||
'--print=target-libdir',
|
||||
]
|
||||
self._rustlib_dir = Path(subprocess.check_output(cmd, cwd=self.cwd, text=True)).parent
|
||||
|
||||
return self._rustlib_dir
|
||||
|
||||
def binaries(self, profile: str) -> List[str]:
|
||||
executables = []
|
||||
|
||||
# This will emit json messages containing test binaries names
|
||||
cmd = [
|
||||
'cargo',
|
||||
'test',
|
||||
'--no-run',
|
||||
'--message-format=json',
|
||||
]
|
||||
env = dict(os.environ, PROFILE=profile)
|
||||
output = subprocess.check_output(cmd, cwd=self.cwd, env=env, text=True)
|
||||
|
||||
for line in output.splitlines(keepends=False):
|
||||
meta = json.loads(line)
|
||||
exe = meta.get('executable')
|
||||
if exe:
|
||||
executables.append(exe)
|
||||
|
||||
# Metadata contains crate names, which can be used
|
||||
# to recover names of executables, e.g. `pageserver`
|
||||
cmd = [
|
||||
'cargo',
|
||||
'metadata',
|
||||
'--format-version=1',
|
||||
'--no-deps',
|
||||
]
|
||||
meta = json.loads(subprocess.check_output(cmd, cwd=self.cwd))
|
||||
|
||||
for pkg in meta.get('packages', []):
|
||||
for target in pkg.get('targets', []):
|
||||
if 'bin' in target['kind']:
|
||||
exe = self.target_dir / profile / target['name']
|
||||
if exe.exists():
|
||||
executables.append(str(exe))
|
||||
|
||||
return executables
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLVM:
|
||||
cargo: Cargo
|
||||
|
||||
def resolve_tool(self, name: str) -> str:
|
||||
exe = self.cargo.rustlib_dir / 'bin' / name
|
||||
if exe.exists():
|
||||
return str(exe)
|
||||
|
||||
if not shutil.which(name):
|
||||
# Show a user-friendly warning
|
||||
raise Exception(' '.join([
|
||||
f"It appears that you don't have `{name}` installed.",
|
||||
"Please execute `rustup component add llvm-tools-preview`,",
|
||||
"or install it via your package manager of choice.",
|
||||
"LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
|
||||
]))
|
||||
|
||||
return name
|
||||
|
||||
def profdata(self, input_dir: Path, output_profdata: Path):
|
||||
profraws = [f for f in input_dir.iterdir() if f.suffix == '.profraw']
|
||||
if not profraws:
|
||||
raise Exception(f'No profraw files found at {input_dir}')
|
||||
|
||||
with open(input_dir / 'profraw.list', 'w') as input_files:
|
||||
profraw_mtime = 0
|
||||
for profraw in profraws:
|
||||
profraw_mtime = max(profraw_mtime, profraw.stat().st_mtime_ns)
|
||||
print(profraw, file=input_files)
|
||||
input_files.flush()
|
||||
|
||||
try:
|
||||
profdata_mtime = output_profdata.stat().st_mtime_ns
|
||||
except FileNotFoundError:
|
||||
profdata_mtime = 0
|
||||
|
||||
# An obvious make-ish optimization
|
||||
if profraw_mtime >= profdata_mtime:
|
||||
subprocess.check_call([
|
||||
self.resolve_tool('llvm-profdata'),
|
||||
'merge',
|
||||
'-sparse',
|
||||
f'-input-files={input_files.name}',
|
||||
f'-output={output_profdata}',
|
||||
])
|
||||
|
||||
def _cov(self,
|
||||
*extras,
|
||||
subcommand: str,
|
||||
profdata: Path,
|
||||
objects: List[str],
|
||||
sources: List[str],
|
||||
demangler: Optional[str] = None) -> None:
|
||||
|
||||
cwd = self.cargo.cwd
|
||||
objects = list(intersperse('-object', objects))
|
||||
extras = list(extras)
|
||||
|
||||
# For some reason `rustc` produces relative paths to src files,
|
||||
# so we force it to cut the $PWD prefix.
|
||||
# see: https://github.com/rust-lang/rust/issues/34701#issuecomment-739809584
|
||||
if sources:
|
||||
extras.append(f'-path-equivalence=.,{cwd.resolve()}')
|
||||
|
||||
if demangler:
|
||||
extras.append(f'-Xdemangler={demangler}')
|
||||
|
||||
cmd = [
|
||||
self.resolve_tool('llvm-cov'),
|
||||
subcommand, # '-dump-collected-paths', # classified debug flag
|
||||
'-instr-profile',
|
||||
str(profdata),
|
||||
*extras,
|
||||
*objects,
|
||||
*sources,
|
||||
]
|
||||
subprocess.check_call(cmd, cwd=cwd)
|
||||
|
||||
def cov_report(self, **kwargs) -> None:
|
||||
self._cov(subcommand='report', **kwargs)
|
||||
|
||||
def cov_export(self, *, kind: str, **kwargs) -> None:
|
||||
extras = [f'-format={kind}']
|
||||
self._cov(subcommand='export', *extras, **kwargs)
|
||||
|
||||
def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
|
||||
extras = [f'-format={kind}']
|
||||
if output_dir:
|
||||
extras.append(f'-output-dir={output_dir}')
|
||||
|
||||
self._cov(subcommand='show', *extras, **kwargs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Report(ABC):
|
||||
""" Common properties of a coverage report """
|
||||
|
||||
llvm: LLVM
|
||||
demangler: str
|
||||
profdata: Path
|
||||
objects: List[str]
|
||||
sources: List[str]
|
||||
|
||||
def _common_kwargs(self):
|
||||
return dict(profdata=self.profdata,
|
||||
objects=self.objects,
|
||||
sources=self.sources,
|
||||
demangler=self.demangler)
|
||||
|
||||
@abstractmethod
|
||||
def generate(self):
|
||||
pass
|
||||
|
||||
def open(self):
|
||||
# Do nothing by default
|
||||
pass
|
||||
|
||||
|
||||
class SummaryReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_report(**self._common_kwargs())
|
||||
|
||||
|
||||
class TextReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_show(kind='text', **self._common_kwargs())
|
||||
|
||||
|
||||
class LcovReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_export(kind='lcov', **self._common_kwargs())
|
||||
|
||||
|
||||
@dataclass
|
||||
class HtmlReport(Report):
|
||||
output_dir: Path
|
||||
|
||||
def generate(self):
|
||||
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **self._common_kwargs())
|
||||
print(f'HTML report is located at `{self.output_dir}`')
|
||||
|
||||
def open(self):
|
||||
tool = dict(linux='xdg-open', darwin='open').get(sys.platform)
|
||||
if not tool:
|
||||
raise Exception(f'Unknown platform {sys.platform}')
|
||||
|
||||
subprocess.check_call([tool, self.output_dir / 'index.html'],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GithubPagesReport(HtmlReport):
|
||||
output_dir: Path
|
||||
commit_url: str
|
||||
|
||||
def generate(self):
|
||||
def index_path(path):
|
||||
return path / 'index.html'
|
||||
|
||||
common = self._common_kwargs()
|
||||
# Provide default sources if there's none
|
||||
common.setdefault('sources', ['.'])
|
||||
|
||||
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **common)
|
||||
shutil.copy(index_path(self.output_dir), self.output_dir / 'local.html')
|
||||
|
||||
with TemporaryDirectory() as tmp:
|
||||
output_dir = Path(tmp)
|
||||
args = dict(common, sources=[])
|
||||
self.llvm.cov_show(kind='html', output_dir=output_dir, **args)
|
||||
shutil.copy(index_path(output_dir), self.output_dir / 'all.html')
|
||||
|
||||
with open(index_path(self.output_dir), 'w') as index:
|
||||
commit_sha = self.commit_url.rsplit('/', maxsplit=1)[-1][:10]
|
||||
|
||||
html = f"""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Coverage ({commit_sha})</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>
|
||||
Coverage report for commit
|
||||
<a href="{self.commit_url}">
|
||||
{commit_sha}
|
||||
</a>
|
||||
</h1>
|
||||
|
||||
<p>
|
||||
<a href="./local.html">
|
||||
<b>Show only local sources</b>
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
<a href="./all.html">
|
||||
Show all sources (including dependencies)
|
||||
</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
index.write(dedent(html))
|
||||
|
||||
print(f'HTML report is located at `{self.output_dir}`')
|
||||
|
||||
|
||||
class State:
|
||||
def __init__(self, cwd: Path, top_dir: Optional[Path], profraw_prefix: Optional[str]):
|
||||
# Use hostname by default
|
||||
profraw_prefix = profraw_prefix or '%h'
|
||||
|
||||
self.cwd = cwd
|
||||
self.cargo = Cargo(self.cwd)
|
||||
self.llvm = LLVM(self.cargo)
|
||||
|
||||
self.top_dir = top_dir or self.cargo.target_dir / 'coverage'
|
||||
self.report_dir = self.top_dir / 'report'
|
||||
|
||||
# Directory for raw coverage data emitted by executables
|
||||
self.profraw_dir = self.top_dir / 'profraw'
|
||||
self.profraw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Aggregated coverage data
|
||||
self.profdata_file = self.top_dir / 'coverage.profdata'
|
||||
|
||||
# Dump all coverage data files into a dedicated directory.
|
||||
# Each filename is parameterized by PID & executable's signature.
|
||||
os.environ['LLVM_PROFILE_FILE'] = str(self.profraw_dir /
|
||||
f'cov-{profraw_prefix}-%p-%m.profraw')
|
||||
|
||||
os.environ['RUSTFLAGS'] = ' '.join([
|
||||
os.environ.get('RUSTFLAGS', ''),
|
||||
# Enable LLVM's source-based coverage
|
||||
# see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
|
||||
# see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html
|
||||
'-Zinstrument-coverage',
|
||||
# Link every bit of code to prevent "holes" in coverage report
|
||||
# see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code
|
||||
'-Clink-dead-code',
|
||||
# Some of the paths that `rustc` embeds into binaries are absolute, others are relative.
|
||||
# The point is, we can't have both, because depending on `-path-equivalence`, `llvm-cov`
|
||||
# either will cripple absolute paths or won't be able to show relative paths at all.
|
||||
# There's no way to turn relative paths into absolute, so we strip $PWD prefix.
|
||||
# Only source files of deps (e.g. `$HOME/.cargo`) will keep their absolute paths,
|
||||
# but we won't include them in report by default (but see `--all`).
|
||||
f'--remap-path-prefix {self.cwd}=',
|
||||
])
|
||||
|
||||
# XXX: God, have mercy on our souls...
|
||||
# see: https://github.com/rust-lang/rust/pull/90132
|
||||
os.environ['RUSTC_BOOTSTRAP'] = '1'
|
||||
|
||||
def do_run(self, args):
|
||||
subprocess.check_call([*args.command, *args.args])
|
||||
|
||||
def do_report(self, args):
|
||||
if args.all and args.sources:
|
||||
raise Exception('--all should not be used with sources')
|
||||
|
||||
# see man for `llvm-cov show [sources]`
|
||||
if args.all:
|
||||
sources = []
|
||||
elif not args.sources:
|
||||
sources = ['.']
|
||||
else:
|
||||
sources = args.sources
|
||||
|
||||
print('* Merging profraw files')
|
||||
self.llvm.profdata(self.profraw_dir, self.profdata_file)
|
||||
|
||||
objects = []
|
||||
if args.input_objects:
|
||||
print('* Collecting object files using --input-objects')
|
||||
with open(args.input_objects) as f:
|
||||
objects.extend(f.read().splitlines(keepends=False))
|
||||
if args.cargo_objects == 'true' or (args.cargo_objects == 'auto'
|
||||
and not args.input_objects):
|
||||
print('* Collecting object files using cargo')
|
||||
objects.extend(self.cargo.binaries(args.profile))
|
||||
|
||||
params = dict(llvm=self.llvm,
|
||||
demangler=find_demangler(args.demangler),
|
||||
profdata=self.profdata_file,
|
||||
objects=objects,
|
||||
sources=sources)
|
||||
|
||||
formats = {
|
||||
'html':
|
||||
lambda: HtmlReport(**params, output_dir=self.report_dir),
|
||||
'text':
|
||||
lambda: TextReport(**params),
|
||||
'lcov':
|
||||
lambda: LcovReport(**params),
|
||||
'summary':
|
||||
lambda: SummaryReport(**params),
|
||||
'github':
|
||||
lambda: GithubPagesReport(
|
||||
**params, output_dir=self.report_dir, commit_url=args.commit_url),
|
||||
}
|
||||
|
||||
report = formats.get(args.format)()
|
||||
if not report:
|
||||
raise Exception('Format `{args.format}` is not supported')
|
||||
|
||||
print(f'* Rendering coverage report ({args.format})')
|
||||
report.generate()
|
||||
|
||||
if args.open:
|
||||
print('* Opening the report')
|
||||
report.open()
|
||||
|
||||
def do_clean(self, args):
|
||||
# Wipe everything if no filters have been provided
|
||||
if not (args.report or args.prof):
|
||||
shutil.rmtree(self.top_dir, ignore_errors=True)
|
||||
else:
|
||||
if args.report:
|
||||
shutil.rmtree(self.report_dir, ignore_errors=True)
|
||||
if args.prof:
|
||||
self.profdata_file.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def main():
|
||||
app = sys.argv[0]
|
||||
example = f"""
|
||||
prerequisites:
|
||||
# alternatively, install a system package for `llvm-tools`
|
||||
rustup component add llvm-tools-preview
|
||||
|
||||
self-contained example:
|
||||
{app} run make
|
||||
{app} run pipenv run pytest test_runner
|
||||
{app} run cargo test
|
||||
{app} report --open
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Coverage report builder',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=example)
|
||||
parser.add_argument('--dir', type=Path, help='output directory')
|
||||
parser.add_argument('--profraw-prefix', metavar='STRING', type=str)
|
||||
|
||||
commands = parser.add_subparsers(title='commands', dest='subparser_name')
|
||||
|
||||
p_run = commands.add_parser('run', help='run a command with magic env')
|
||||
p_run.add_argument('command', nargs=1)
|
||||
p_run.add_argument('args', nargs=argparse.REMAINDER)
|
||||
|
||||
p_report = commands.add_parser('report', help='generate a coverage report')
|
||||
p_report.add_argument('--profile',
|
||||
default='debug',
|
||||
choices=('debug', 'release'),
|
||||
help='cargo build profile')
|
||||
p_report.add_argument('--format',
|
||||
default='html',
|
||||
choices=('html', 'text', 'summary', 'lcov', 'github'),
|
||||
help='report format')
|
||||
p_report.add_argument('--input-objects',
|
||||
metavar='FILE',
|
||||
type=Path,
|
||||
help='file containing list of binaries')
|
||||
p_report.add_argument('--cargo-objects',
|
||||
default='auto',
|
||||
choices=('auto', 'true', 'false'),
|
||||
help='use cargo for auto discovery of binaries')
|
||||
p_report.add_argument('--commit-url', type=str, help='required for --format=github')
|
||||
p_report.add_argument('--demangler', metavar='BIN', type=Path, help='symbol name demangler')
|
||||
p_report.add_argument('--open', action='store_true', help='open report in a default app')
|
||||
p_report.add_argument('--all', action='store_true', help='show everything, e.g. deps')
|
||||
p_report.add_argument('sources', nargs='*', type=Path, help='source file or directory')
|
||||
|
||||
p_clean = commands.add_parser('clean', help='wipe coverage artifacts')
|
||||
p_clean.add_argument('--report', action='store_true', help='pick generated report')
|
||||
p_clean.add_argument('--prof', action='store_true', help='pick *.profdata & *.profraw')
|
||||
|
||||
args = parser.parse_args()
|
||||
state = State(cwd=Path.cwd(), top_dir=args.dir, profraw_prefix=args.profraw_prefix)
|
||||
|
||||
commands = {
|
||||
'run': state.do_run,
|
||||
'report': state.do_report,
|
||||
'clean': state.do_clean,
|
||||
}
|
||||
|
||||
action = commands.get(args.subparser_name)
|
||||
if action:
|
||||
action(args)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# this is a shortcut script to avoid duplication in CI
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
git clone https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git
|
||||
cd zenith-perf-data
|
||||
mkdir -p reports/
|
||||
mkdir -p data/$REPORT_TO
|
||||
|
||||
cp $REPORT_FROM/* data/$REPORT_TO
|
||||
|
||||
echo "Generating report"
|
||||
pipenv run python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html
|
||||
echo "Uploading perf result"
|
||||
git add data reports
|
||||
git \
|
||||
-c "user.name=vipvap" \
|
||||
-c "user.email=vipvap@zenith.tech" \
|
||||
commit \
|
||||
--author="vipvap <vipvap@zenith.tech>" \
|
||||
-m "add performance test result for $GITHUB_SHA zenith revision"
|
||||
|
||||
git push https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git master
|
||||
@@ -1,207 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional, Tuple, cast
|
||||
from jinja2 import Template
|
||||
|
||||
# skip 'input' columns. They are included in the header and just blow the table
|
||||
EXCLUDE_COLUMNS = frozenset({
|
||||
'scale',
|
||||
'duration',
|
||||
'number_of_clients',
|
||||
'number_of_threads',
|
||||
'init_start_timestamp',
|
||||
'init_end_timestamp',
|
||||
'run_start_timestamp',
|
||||
'run_end_timestamp',
|
||||
})
|
||||
|
||||
KEY_EXCLUDE_FIELDS = frozenset({
|
||||
'init_start_timestamp',
|
||||
'init_end_timestamp',
|
||||
'run_start_timestamp',
|
||||
'run_end_timestamp',
|
||||
})
|
||||
NEGATIVE_COLOR = 'negative'
|
||||
POSITIVE_COLOR = 'positive'
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuitRun:
|
||||
revision: str
|
||||
values: Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuitRuns:
|
||||
platform: str
|
||||
suit: str
|
||||
common_columns: List[Tuple[str, str]]
|
||||
value_columns: List[str]
|
||||
runs: List[SuitRun]
|
||||
|
||||
|
||||
@dataclass
|
||||
class RowValue:
|
||||
value: str
|
||||
color: str
|
||||
ratio: str
|
||||
|
||||
|
||||
def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
|
||||
value_columns = []
|
||||
common_columns = []
|
||||
for item in values:
|
||||
if item['name'] in KEY_EXCLUDE_FIELDS:
|
||||
continue
|
||||
if item['report'] != 'test_param':
|
||||
value_columns.append(cast(str, item['name']))
|
||||
else:
|
||||
common_columns.append((cast(str, item['name']), cast(str, item['value'])))
|
||||
value_columns.sort()
|
||||
common_columns.sort(key=lambda x: x[0]) # sort by name
|
||||
return common_columns, value_columns
|
||||
|
||||
|
||||
def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
|
||||
color = ''
|
||||
sign = '+' if ratio > 0 else ''
|
||||
if abs(ratio) < 0.05:
|
||||
return f' ({sign}{ratio:.2f})', color
|
||||
|
||||
if report not in {'test_param', 'higher_is_better', 'lower_is_better'}:
|
||||
raise ValueError(f'Unknown report type: {report}')
|
||||
|
||||
if report == 'test_param':
|
||||
return f'{ratio:.2f}', color
|
||||
|
||||
if ratio > 0:
|
||||
if report == 'higher_is_better':
|
||||
color = POSITIVE_COLOR
|
||||
elif report == 'lower_is_better':
|
||||
color = NEGATIVE_COLOR
|
||||
elif ratio < 0:
|
||||
if report == 'higher_is_better':
|
||||
color = NEGATIVE_COLOR
|
||||
elif report == 'lower_is_better':
|
||||
color = POSITIVE_COLOR
|
||||
|
||||
return f' ({sign}{ratio:.2f})', color
|
||||
|
||||
|
||||
def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
|
||||
for item in suit_run.values['data']:
|
||||
if item['name'] == name:
|
||||
return cast(Dict[str, Any], item)
|
||||
return None
|
||||
|
||||
|
||||
def get_row_values(columns: List[str], run_result: SuitRun,
|
||||
prev_result: Optional[SuitRun]) -> List[RowValue]:
|
||||
row_values = []
|
||||
for column in columns:
|
||||
current_value = extract_value(column, run_result)
|
||||
if current_value is None:
|
||||
# should never happen
|
||||
raise ValueError(f'{column} not found in {run_result.values}')
|
||||
|
||||
value = current_value["value"]
|
||||
if isinstance(value, float):
|
||||
value = f'{value:.2f}'
|
||||
|
||||
if prev_result is None:
|
||||
row_values.append(RowValue(value, '', ''))
|
||||
continue
|
||||
|
||||
prev_value = extract_value(column, prev_result)
|
||||
if prev_value is None:
|
||||
# this might happen when new metric is added and there is no value for it in previous run
|
||||
# let this be here, TODO add proper handling when this actually happens
|
||||
raise ValueError(f'{column} not found in previous result')
|
||||
ratio = float(value) / float(prev_value['value']) - 1
|
||||
ratio_display, color = format_ratio(ratio, current_value['report'])
|
||||
row_values.append(RowValue(value, color, ratio_display))
|
||||
return row_values
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuiteRunTableRow:
|
||||
revision: str
|
||||
values: List[RowValue]
|
||||
|
||||
|
||||
def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
|
||||
rows = []
|
||||
prev_run = None
|
||||
for run in runs:
|
||||
rows.append(
|
||||
SuiteRunTableRow(revision=run.revision,
|
||||
values=get_row_values(value_columns, run, prev_run)))
|
||||
prev_run = run
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
def main(args: argparse.Namespace) -> None:
|
||||
input_dir = Path(args.input_dir)
|
||||
grouped_runs: Dict[str, SuitRuns] = {}
|
||||
# we have files in form: <ctr>_<rev>.json
|
||||
# fill them in the hashmap so we have grouped items for the
|
||||
# same run configuration (scale, duration etc.) ordered by counter.
|
||||
for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])):
|
||||
run_data = json.loads(item.read_text())
|
||||
revision = run_data['revision']
|
||||
|
||||
for suit_result in run_data['result']:
|
||||
key = "{}{}".format(run_data['platform'], suit_result['suit'])
|
||||
# pack total duration as a synthetic value
|
||||
total_duration = suit_result['total_duration']
|
||||
suit_result['data'].append({
|
||||
'name': 'total_duration',
|
||||
'value': total_duration,
|
||||
'unit': 's',
|
||||
'report': 'lower_is_better',
|
||||
})
|
||||
common_columns, value_columns = get_columns(suit_result['data'])
|
||||
|
||||
grouped_runs.setdefault(
|
||||
key,
|
||||
SuitRuns(
|
||||
platform=run_data['platform'],
|
||||
suit=suit_result['suit'],
|
||||
common_columns=common_columns,
|
||||
value_columns=value_columns,
|
||||
runs=[],
|
||||
),
|
||||
)
|
||||
|
||||
grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
|
||||
context = {}
|
||||
for result in grouped_runs.values():
|
||||
suit = result.suit
|
||||
context[suit] = {
|
||||
'common_columns': result.common_columns,
|
||||
'value_columns': result.value_columns,
|
||||
'platform': result.platform,
|
||||
# reverse the order so newest results are on top of the table
|
||||
'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
|
||||
}
|
||||
|
||||
template = Template((Path(__file__).parent / 'perf_report_template.html').read_text())
|
||||
|
||||
Path(args.out).write_text(template.render(context=context))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--input-dir',
|
||||
dest='input_dir',
|
||||
required=True,
|
||||
help='Directory with jsons generated by the test suite',
|
||||
)
|
||||
parser.add_argument('--out', required=True, help='Output html file path')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@@ -1,136 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from contextlib import contextmanager
|
||||
from tempfile import TemporaryDirectory
|
||||
from pathlib import Path
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def absolute_path(path):
|
||||
return Path(path).resolve()
|
||||
|
||||
|
||||
def relative_path(path):
|
||||
path = Path(path)
|
||||
if path.is_absolute():
|
||||
raise Exception(f'path `{path}` must be relative!')
|
||||
return path
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chdir(cwd: Path):
|
||||
old = os.getcwd()
|
||||
os.chdir(cwd)
|
||||
try:
|
||||
yield cwd
|
||||
finally:
|
||||
os.chdir(old)
|
||||
|
||||
|
||||
def run(cmd, *args, **kwargs):
|
||||
print('$', ' '.join(cmd))
|
||||
subprocess.check_call(cmd, *args, **kwargs)
|
||||
|
||||
|
||||
class GitRepo:
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.cwd = TemporaryDirectory()
|
||||
|
||||
subprocess.check_call([
|
||||
'git',
|
||||
'clone',
|
||||
str(url),
|
||||
self.cwd.name,
|
||||
])
|
||||
|
||||
def is_dirty(self):
|
||||
res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
|
||||
return bool(res)
|
||||
|
||||
def update(self, message, action, branch=None):
|
||||
with chdir(self.cwd.name):
|
||||
if not branch:
|
||||
cmd = ['git', 'branch', '--show-current']
|
||||
branch = subprocess.check_output(cmd, text=True).strip()
|
||||
|
||||
# Run action in repo's directory
|
||||
action()
|
||||
|
||||
run(['git', 'add', '.'])
|
||||
|
||||
if not self.is_dirty():
|
||||
print('No changes detected, quitting')
|
||||
return
|
||||
|
||||
run([
|
||||
'git',
|
||||
'-c',
|
||||
'user.name=vipvap',
|
||||
'-c',
|
||||
'user.email=vipvap@zenith.tech',
|
||||
'commit',
|
||||
'--author="vipvap <vipvap@zenith.tech>"',
|
||||
f'--message={message}',
|
||||
])
|
||||
|
||||
for _ in range(5):
|
||||
try:
|
||||
run(['git', 'fetch', 'origin', branch])
|
||||
run(['git', 'rebase', f'origin/{branch}'])
|
||||
run(['git', 'push', 'origin', branch])
|
||||
return
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)
|
||||
|
||||
raise Exception(f'failed to update branch `{branch}`')
|
||||
|
||||
|
||||
def do_copy(args):
|
||||
src = args.src
|
||||
dst = args.dst
|
||||
|
||||
try:
|
||||
if src.is_dir():
|
||||
shutil.copytree(src, dst)
|
||||
else:
|
||||
shutil.copy(src, dst)
|
||||
except FileExistsError:
|
||||
if args.forbid_overwrite:
|
||||
raise
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Git upload tool')
|
||||
parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
|
||||
parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
|
||||
|
||||
commands = parser.add_subparsers(title='commands', dest='subparser_name')
|
||||
|
||||
p_copy = commands.add_parser('copy', help='copy file into the repo')
|
||||
p_copy.add_argument('src', type=absolute_path, help='source path')
|
||||
p_copy.add_argument('dst', type=relative_path, help='relative dest path')
|
||||
p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
commands = {
|
||||
'copy': do_copy,
|
||||
}
|
||||
|
||||
action = commands.get(args.subparser_name)
|
||||
if action:
|
||||
message = args.message or 'update'
|
||||
GitRepo(args.repo).update(message, lambda: action(args))
|
||||
else:
|
||||
parser.print_usage()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,52 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
|
||||
<body>
|
||||
<style>
|
||||
table,
|
||||
th,
|
||||
td {
|
||||
border: 1px solid black;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
.positive {
|
||||
background-color: rgba(0, 255, 0, 0.8)
|
||||
}
|
||||
|
||||
.negative {
|
||||
background-color: rgba(255, 0, 0, 0.65)
|
||||
}
|
||||
</style>
|
||||
|
||||
<h2>Zenith Performance Tests</h2>
|
||||
|
||||
{% for suit_name, suit_data in context.items() %}
|
||||
<h3>Runs for {{ suit_name }} </h3>
|
||||
<b>platform:</b> {{ suit_data.platform }}<br>
|
||||
{% for common_column_name, common_column_value in suit_data.common_columns %}
|
||||
<b>{{ common_column_name }}</b>: {{ common_column_value }}<br>
|
||||
{% endfor %}
|
||||
<br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>revision</th>
|
||||
{% for column_name in suit_data.value_columns %}
|
||||
<th>{{ column_name }}</th>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% for row in suit_data.rows %}
|
||||
<tr>
|
||||
<td><a href=https://github.com/zenithdb/zenith/commit/{{ row.revision }}>{{ row.revision[:6] }}</a></td>
|
||||
{% for column_value in row.values %}
|
||||
<td class="{{ column_value.color }}">{{ column_value.value }}{{column_value.ratio}}</td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
{% endfor %}
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
22
test_runner/Pipfile
Normal file
22
test_runner/Pipfile
Normal file
@@ -0,0 +1,22 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
pytest = ">=6.0.0"
|
||||
psycopg2 = "*"
|
||||
typing-extensions = "*"
|
||||
pyjwt = {extras = ["crypto"], version = "*"}
|
||||
requests = "*"
|
||||
pytest-xdist = "*"
|
||||
asyncpg = "*"
|
||||
|
||||
[dev-packages]
|
||||
yapf = "*"
|
||||
flake8 = "*"
|
||||
mypy = "*"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.6, but pipenv doesn't allow to say this directly
|
||||
python_version = "3"
|
||||
376
test_runner/Pipfile.lock
generated
Normal file
376
test_runner/Pipfile.lock
generated
Normal file
@@ -0,0 +1,376 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "3cdc048691824d0b93912b6b78a0aa01dc98f278212c1badb0cc2edbd2103c3a"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:2bbf76fd432960138b3ef6dda3dde0544f27cbf8546c458e60baf371917ba9ee",
|
||||
"sha256:50b1e4f8446b06f41be7dd6338db18e0990601dce795c2b1686458aa7e8fa7d8"
|
||||
],
|
||||
"version": "==2021.5.30"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:06c54a68935738d206570b20da5ef2b6b6d92b38ef3ec45c5422c0ebaf338d4d",
|
||||
"sha256:0c0591bee64e438883b0c92a7bed78f6290d40bf02e54c5bf0978eaf36061771",
|
||||
"sha256:19ca0dbdeda3b2615421d54bef8985f72af6e0c47082a8d26122adac81a95872",
|
||||
"sha256:22b9c3c320171c108e903d61a3723b51e37aaa8c81255b5e7ce102775bd01e2c",
|
||||
"sha256:26bb2549b72708c833f5abe62b756176022a7b9a7f689b571e74c8478ead51dc",
|
||||
"sha256:33791e8a2dc2953f28b8d8d300dde42dd929ac28f974c4b4c6272cb2955cb762",
|
||||
"sha256:3c8d896becff2fa653dc4438b54a5a25a971d1f4110b32bd3068db3722c80202",
|
||||
"sha256:4373612d59c404baeb7cbd788a18b2b2a8331abcc84c3ba40051fcd18b17a4d5",
|
||||
"sha256:487d63e1454627c8e47dd230025780e91869cfba4c753a74fda196a1f6ad6548",
|
||||
"sha256:48916e459c54c4a70e52745639f1db524542140433599e13911b2f329834276a",
|
||||
"sha256:4922cd707b25e623b902c86188aca466d3620892db76c0bdd7b99a3d5e61d35f",
|
||||
"sha256:55af55e32ae468e9946f741a5d51f9896da6b9bf0bbdd326843fec05c730eb20",
|
||||
"sha256:57e555a9feb4a8460415f1aac331a2dc833b1115284f7ded7278b54afc5bd218",
|
||||
"sha256:5d4b68e216fc65e9fe4f524c177b54964af043dde734807586cf5435af84045c",
|
||||
"sha256:64fda793737bc4037521d4899be780534b9aea552eb673b9833b01f945904c2e",
|
||||
"sha256:6d6169cb3c6c2ad50db5b868db6491a790300ade1ed5d1da29289d73bbe40b56",
|
||||
"sha256:7bcac9a2b4fdbed2c16fa5681356d7121ecabf041f18d97ed5b8e0dd38a80224",
|
||||
"sha256:80b06212075346b5546b0417b9f2bf467fea3bfe7352f781ffc05a8ab24ba14a",
|
||||
"sha256:818014c754cd3dba7229c0f5884396264d51ffb87ec86e927ef0be140bfdb0d2",
|
||||
"sha256:8eb687582ed7cd8c4bdbff3df6c0da443eb89c3c72e6e5dcdd9c81729712791a",
|
||||
"sha256:99f27fefe34c37ba9875f224a8f36e31d744d8083e00f520f133cab79ad5e819",
|
||||
"sha256:9f3e33c28cd39d1b655ed1ba7247133b6f7fc16fa16887b120c0c670e35ce346",
|
||||
"sha256:a8661b2ce9694ca01c529bfa204dbb144b275a31685a075ce123f12331be790b",
|
||||
"sha256:a9da7010cec5a12193d1af9872a00888f396aba3dc79186604a09ea3ee7c029e",
|
||||
"sha256:aedb15f0a5a5949ecb129a82b72b19df97bbbca024081ed2ef88bd5c0a610534",
|
||||
"sha256:b315d709717a99f4b27b59b021e6207c64620790ca3e0bde636a6c7f14618abb",
|
||||
"sha256:ba6f2b3f452e150945d58f4badd92310449876c4c954836cfb1803bdd7b422f0",
|
||||
"sha256:c33d18eb6e6bc36f09d793c0dc58b0211fccc6ae5149b808da4a62660678b156",
|
||||
"sha256:c9a875ce9d7fe32887784274dd533c57909b7b1dcadcc128a2ac21331a9765dd",
|
||||
"sha256:c9e005e9bd57bc987764c32a1bee4364c44fdc11a3cc20a40b93b444984f2b87",
|
||||
"sha256:d2ad4d668a5c0645d281dcd17aff2be3212bc109b33814bbb15c4939f44181cc",
|
||||
"sha256:d950695ae4381ecd856bcaf2b1e866720e4ab9a1498cba61c602e56630ca7195",
|
||||
"sha256:e22dcb48709fc51a7b58a927391b23ab37eb3737a98ac4338e2448bef8559b33",
|
||||
"sha256:e8c6a99be100371dbb046880e7a282152aa5d6127ae01783e37662ef73850d8f",
|
||||
"sha256:e9dc245e3ac69c92ee4c167fbdd7428ec1956d4e754223124991ef29eb57a09d",
|
||||
"sha256:eb687a11f0a7a1839719edd80f41e459cc5366857ecbed383ff376c4e3cc6afd",
|
||||
"sha256:eb9e2a346c5238a30a746893f23a9535e700f8192a68c07c0258e7ece6ff3728",
|
||||
"sha256:ed38b924ce794e505647f7c331b22a693bee1538fdf46b0222c4717b42f744e7",
|
||||
"sha256:f0010c6f9d1a4011e429109fda55a225921e3206e7f62a0c22a35344bfd13cca",
|
||||
"sha256:f0c5d1acbfca6ebdd6b1e3eded8d261affb6ddcf2186205518f1428b8569bb99",
|
||||
"sha256:f10afb1004f102c7868ebfe91c28f4a712227fe4cb24974350ace1f90e1febbf",
|
||||
"sha256:f174135f5609428cc6e1b9090f9268f5c8935fddb1b25ccb8255a2d50de6789e",
|
||||
"sha256:f3ebe6e73c319340830a9b2825d32eb6d8475c1dac020b4f0aa774ee3b898d1c",
|
||||
"sha256:f627688813d0a4140153ff532537fbe4afea5a3dffce1f9deb7f91f848a832b5",
|
||||
"sha256:fd4305f86f53dfd8cd3522269ed7fc34856a8ee3709a5e28b2836b2db9d4cd69"
|
||||
],
|
||||
"version": "==1.14.6"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:5d209c0a931f215cee683b6445e2d77677e7e75e159f78def0db09d68fafcaa6",
|
||||
"sha256:5ec46d183433dcbd0ab716f2d7f29d8dee50505b3fdb40c6b985c7c4f5a3591f"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.6"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:0a7dcbcd3f1913f664aca35d47c1331fce738d44ec34b7be8b9d332151b0b01e",
|
||||
"sha256:1eb7bb0df6f6f583dd8e054689def236255161ebbcf62b226454ab9ec663746b",
|
||||
"sha256:21ca464b3a4b8d8e86ba0ee5045e103a1fcfac3b39319727bc0fc58c09c6aff7",
|
||||
"sha256:34dae04a0dce5730d8eb7894eab617d8a70d0c97da76b905de9efb7128ad7085",
|
||||
"sha256:3520667fda779eb788ea00080124875be18f2d8f0848ec00733c0ec3bb8219fc",
|
||||
"sha256:3c4129fc3fdc0fa8e40861b5ac0c673315b3c902bbdc05fc176764815b43dd1d",
|
||||
"sha256:3fa3a7ccf96e826affdf1a0a9432be74dc73423125c8f96a909e3835a5ef194a",
|
||||
"sha256:5b0fbfae7ff7febdb74b574055c7466da334a5371f253732d7e2e7525d570498",
|
||||
"sha256:695104a9223a7239d155d7627ad912953b540929ef97ae0c34c7b8bf30857e89",
|
||||
"sha256:8695456444f277af73a4877db9fc979849cd3ee74c198d04fc0776ebc3db52b9",
|
||||
"sha256:94cc5ed4ceaefcbe5bf38c8fba6a21fc1d365bb8fb826ea1688e3370b2e24a1c",
|
||||
"sha256:94fff993ee9bc1b2440d3b7243d488c6a3d9724cc2b09cdb297f6a886d040ef7",
|
||||
"sha256:9965c46c674ba8cc572bc09a03f4c649292ee73e1b683adb1ce81e82e9a6a0fb",
|
||||
"sha256:a00cf305f07b26c351d8d4e1af84ad7501eca8a342dedf24a7acb0e7b7406e14",
|
||||
"sha256:a305600e7a6b7b855cd798e00278161b681ad6e9b7eca94c721d5f588ab212af",
|
||||
"sha256:cd65b60cfe004790c795cc35f272e41a3df4631e2fb6b35aa7ac6ef2859d554e",
|
||||
"sha256:d2a6e5ef66503da51d2110edf6c403dc6b494cc0082f85db12f54e9c5d4c3ec5",
|
||||
"sha256:d9ec0e67a14f9d1d48dd87a2531009a9b251c02ea42851c060b25c782516ff06",
|
||||
"sha256:f44d141b8c4ea5eb4dbc9b3ad992d45580c1d22bf5e24363f2fbf50c2d7ae8a7"
|
||||
],
|
||||
"version": "==3.4.8"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:14475042e284991034cb48e06f6851428fb14c4dc953acd9be9a5e95c7b6dd7a",
|
||||
"sha256:467fbad99067910785144ce333826c71fb0e63a425657295239737f7ecd125f3"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.2"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7",
|
||||
"sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.0"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa",
|
||||
"sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb",
|
||||
"sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62",
|
||||
"sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25",
|
||||
"sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854",
|
||||
"sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56",
|
||||
"sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188",
|
||||
"sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf",
|
||||
"sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"pyjwt": {
|
||||
"extras": [
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:934d73fbba91b0483d3857d1aff50e96b2a892384ee2c17417ed3203f173fca1",
|
||||
"sha256:fba44e7898bbca160a2b2b501f492824fc8382485d3a6f11ba5d0c1937ce6130"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.1.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
|
||||
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.5"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
|
||||
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
|
||||
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
|
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.26.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:39fb8672126159acb139a7718dd10806104dec1e2f0f6c88aab05d17df10c8d4",
|
||||
"sha256:f57b4c16c62fa2760b7e3d97c35b255512fb6b59a259730f36ba32ce9f8e342f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.6"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:07528381786f2a6237b061f6e96610a4167b226cb926e2aa2b6b1d78057c576b",
|
||||
"sha256:bf8fd333346d844f616e8d47905ef3a3384edae6b4e9beb0c5101e25e3110907"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.9.2"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
|
||||
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
|
||||
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
|
||||
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
|
||||
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
|
||||
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
|
||||
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
|
||||
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
|
||||
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
|
||||
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
|
||||
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
|
||||
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
|
||||
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
|
||||
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
|
||||
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
|
||||
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
|
||||
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
|
||||
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
|
||||
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
|
||||
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
|
||||
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
|
||||
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
|
||||
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.910"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:514f76d918fcc0b55c6680472f0a37970994e07bbb80725808c17089be302068",
|
||||
"sha256:c389c1d06bf7904078ca03399a4816f974a1d590090fecea0c63ec26ebaf1cef"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.7.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:7893783d01b8a89811dd72d7dfd4d84ff098e5eed95cfa8905b22bbffe52efc3",
|
||||
"sha256:f5bc8ecabc05bb9d291eb5203d6810b49040f6ff446a756326104746cc00c1db"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.3.1"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3,13 +3,18 @@
|
||||
This directory contains integration tests.
|
||||
|
||||
Prerequisites:
|
||||
- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
|
||||
- Python 3.6 or later
|
||||
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
|
||||
packages are stale, as it commonly happens, so manual installation is not
|
||||
recommended.
|
||||
Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
|
||||
command in the venv, e.g. `pipenv run pytest`.
|
||||
- Zenith and Postgres binaries
|
||||
- See the root [README.md](/README.md) for build directions
|
||||
- See the root README.md for build directions
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
below to run from other directories.
|
||||
- The zenith git repo, including the postgres submodule
|
||||
(for some tests, e.g. `pg_regress`)
|
||||
(for some tests, e.g. pg_regress)
|
||||
|
||||
### Test Organization
|
||||
|
||||
@@ -30,15 +35,15 @@ be stored under a directory `test_output`.
|
||||
|
||||
You can run all the tests with:
|
||||
|
||||
`pipenv run pytest`
|
||||
`pytest`
|
||||
|
||||
If you want to run all the tests in a particular file:
|
||||
|
||||
`pipenv run pytest test_pgbench.py`
|
||||
`pytest test_pgbench.py`
|
||||
|
||||
If you want to run all tests that have the string "bench" in their names:
|
||||
|
||||
`pipenv run pytest -k bench`
|
||||
`pytest -k bench`
|
||||
|
||||
Useful environment variables:
|
||||
|
||||
@@ -48,8 +53,8 @@ Useful environment variables:
|
||||
should go.
|
||||
`TEST_SHARED_FIXTURES`: Try to re-use a single pageserver for all the tests.
|
||||
|
||||
Let stdout, stderr and `INFO` log messages go to the terminal instead of capturing them:
|
||||
`pytest -s --log-cli-level=INFO ...`
|
||||
Let stdout and stderr go to the terminal instead of capturing them:
|
||||
`pytest -s ...`
|
||||
(Note many tests capture subprocess outputs separately, so this may not
|
||||
show much.)
|
||||
|
||||
@@ -57,51 +62,44 @@ Exit after the first test failure:
|
||||
`pytest -x ...`
|
||||
(there are many more pytest options; run `pytest -h` to see them.)
|
||||
|
||||
### Writing a test
|
||||
|
||||
Every test needs a Zenith Environment, or ZenithEnv to operate in. A Zenith Environment
|
||||
is like a little cloud-in-a-box, and consists of a Pageserver, 0-N Safekeepers, and
|
||||
compute Postgres nodes. The connections between them can be configured to use JWT
|
||||
authentication tokens, and some other configuration options can be tweaked too.
|
||||
### Building new tests
|
||||
|
||||
The easiest way to get access to a Zenith Environment is by using the `zenith_simple_env`
|
||||
fixture. The 'simple' env may be shared across multiple tests, so don't shut down the nodes
|
||||
or make other destructive changes in that environment. Also don't assume that
|
||||
there are no tenants or branches or data in the cluster. For convenience, there is a
|
||||
branch called `empty`, though. The convention is to create a test-specific branch of
|
||||
that and load any test data there, instead of the 'main' branch.
|
||||
The tests make heavy use of pytest fixtures. You can read about how they work here: https://docs.pytest.org/en/stable/fixture.html
|
||||
|
||||
For more complicated cases, you can build a custom Zenith Environment, with the `zenith_env`
|
||||
fixture:
|
||||
Essentially, this means that each time you see a fixture named as an input parameter, the function with that name will be run and passed as a parameter to the function.
|
||||
|
||||
So this code:
|
||||
|
||||
```python
|
||||
def test_foobar(zenith_env_builder: ZenithEnvBuilder):
|
||||
# Prescribe the environment.
|
||||
# We want to have 3 safekeeper nodes, and use JWT authentication in the
|
||||
# connections to the page server
|
||||
zenith_env_builder.num_safekeepers = 3
|
||||
zenith_env_builder.set_pageserver_auth(True)
|
||||
|
||||
# Now create the environment. This initializes the repository, and starts
|
||||
# up the page server and the safekeepers
|
||||
env = zenith_env_builder.init()
|
||||
|
||||
# Run the test
|
||||
...
|
||||
def test_something(zenith_cli, pg_bin):
|
||||
pass
|
||||
```
|
||||
|
||||
For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html
|
||||
... will run the fixtures called `zenith_cli` and `pg_bin` and deliver those results to the test function.
|
||||
|
||||
At the end of a test, all the nodes in the environment are automatically stopped, so you
|
||||
don't need to worry about cleaning up. Logs and test data are preserved for the analysis,
|
||||
in a directory under `../test_output/<testname>`
|
||||
Fixtures can't be imported using the normal python syntax. Instead, use this:
|
||||
|
||||
### Before submitting a patch
|
||||
Ensure that you pass all [obligatory checks](/docs/sourcetree.md#obligatory-checks).
|
||||
```python
|
||||
pytest_plugins = ("fixtures.something")
|
||||
```
|
||||
|
||||
Also consider:
|
||||
That will make all the fixtures in the `fixtures/something.py` file available.
|
||||
|
||||
Anything that's likely to be used in multiple tests should be built into a fixture.
|
||||
|
||||
Note that fixtures can clean up after themselves if they use the `yield` syntax.
|
||||
Cleanup will happen even if the test fails (raises an unhandled exception).
|
||||
Python destructors, e.g. `__del__()` aren't recommended for cleanup.
|
||||
|
||||
|
||||
### Code quality
|
||||
|
||||
Before submitting a patch, please consider:
|
||||
|
||||
* Writing a couple of docstrings to clarify the reasoning behind a new test.
|
||||
* Adding more type hints to your code to avoid `Any`, especially:
|
||||
* For fixture parameters, they are not automatically deduced.
|
||||
* For function arguments and return values.
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Formatting the code with `yapf -r -i .` (TODO: implement an opt-in pre-commit hook for that).
|
||||
* (Optional) Typechecking the code with `mypy .`. Currently this mostly affects `fixtures/zenith_fixtures.py`.
|
||||
|
||||
The tools can be installed with `pipenv install --dev`.
|
||||
|
||||
@@ -1,22 +1,21 @@
|
||||
|
||||
from contextlib import closing
|
||||
from typing import Iterator
|
||||
from uuid import uuid4
|
||||
import psycopg2
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
from fixtures.zenith_fixtures import PortDistributor, Postgres, ZenithCli, ZenithPageserver, PgBin
|
||||
import pytest
|
||||
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
|
||||
zenith_env_builder.pageserver_auth_enabled = True
|
||||
env = zenith_env_builder.init()
|
||||
def test_pageserver_auth(pageserver_auth_enabled: ZenithPageserver):
|
||||
ps = pageserver_auth_enabled
|
||||
|
||||
ps = env.pageserver
|
||||
|
||||
tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant)
|
||||
invalid_tenant_token = env.auth_keys.generate_tenant_token(uuid4().hex)
|
||||
management_token = env.auth_keys.generate_management_token()
|
||||
tenant_token = ps.auth_keys.generate_tenant_token(ps.initial_tenant)
|
||||
invalid_tenant_token = ps.auth_keys.generate_tenant_token(uuid4().hex)
|
||||
management_token = ps.auth_keys.generate_management_token()
|
||||
|
||||
# this does not invoke auth check and only decodes jwt and checks it for validity
|
||||
# check both tokens
|
||||
@@ -24,41 +23,57 @@ def test_pageserver_auth(zenith_env_builder: ZenithEnvBuilder):
|
||||
ps.safe_psql("status", password=management_token)
|
||||
|
||||
# tenant can create branches
|
||||
ps.safe_psql(f"branch_create {env.initial_tenant} new1 main", password=tenant_token)
|
||||
ps.safe_psql(f"branch_create {ps.initial_tenant} new1 main", password=tenant_token)
|
||||
# console can create branches for tenant
|
||||
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=management_token)
|
||||
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=management_token)
|
||||
|
||||
# fail to create branch using token with different tenantid
|
||||
with pytest.raises(psycopg2.DatabaseError, match='Tenant id mismatch. Permission denied'):
|
||||
ps.safe_psql(f"branch_create {env.initial_tenant} new2 main", password=invalid_tenant_token)
|
||||
ps.safe_psql(f"branch_create {ps.initial_tenant} new2 main", password=invalid_tenant_token)
|
||||
|
||||
# create tenant using management token
|
||||
ps.safe_psql(f"tenant_create {uuid4().hex}", password=management_token)
|
||||
|
||||
# fail to create tenant using tenant token
|
||||
with pytest.raises(
|
||||
psycopg2.DatabaseError,
|
||||
match='Attempt to access management api with tenant scope. Permission denied'):
|
||||
with pytest.raises(psycopg2.DatabaseError, match='Attempt to access management api with tenant scope. Permission denied'):
|
||||
ps.safe_psql(f"tenant_create {uuid4().hex}", password=tenant_token)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('with_wal_acceptors', [False, True])
|
||||
def test_compute_auth_to_pageserver(zenith_env_builder: ZenithEnvBuilder, with_wal_acceptors: bool):
|
||||
zenith_env_builder.pageserver_auth_enabled = True
|
||||
if with_wal_acceptors:
|
||||
zenith_env_builder.num_safekeepers = 3
|
||||
env = zenith_env_builder.init()
|
||||
def test_compute_auth_to_pageserver(
|
||||
zenith_cli: ZenithCli,
|
||||
wa_factory,
|
||||
pageserver_auth_enabled: ZenithPageserver,
|
||||
repo_dir: str,
|
||||
with_wal_acceptors: bool,
|
||||
pg_bin: PgBin,
|
||||
port_distributor: PortDistributor,
|
||||
):
|
||||
ps = pageserver_auth_enabled
|
||||
# since we are in progress of refactoring protocols between compute safekeeper and page server
|
||||
# use hardcoded management token in safekeeper
|
||||
management_token = ps.auth_keys.generate_management_token()
|
||||
|
||||
branch = f"test_compute_auth_to_pageserver{with_wal_acceptors}"
|
||||
env.zenith_cli(["branch", branch, "main"])
|
||||
zenith_cli.run(["branch", branch, "empty"])
|
||||
if with_wal_acceptors:
|
||||
wa_factory.start_n_new(3, management_token)
|
||||
|
||||
pg = env.postgres.create_start(branch)
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# we rely upon autocommit after each statement
|
||||
# as waiting for acceptors happens there
|
||||
cur.execute('CREATE TABLE t(key int primary key, value text)')
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
assert cur.fetchone() == (5000050000, )
|
||||
with Postgres(
|
||||
zenith_cli=zenith_cli,
|
||||
repo_dir=repo_dir,
|
||||
pg_bin=pg_bin,
|
||||
tenant_id=ps.initial_tenant,
|
||||
port=port_distributor.get_port(),
|
||||
).create_start(
|
||||
branch,
|
||||
wal_acceptors=wa_factory.get_connstrs() if with_wal_acceptors else None,
|
||||
) as pg:
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
# we rely upon autocommit after each statement
|
||||
# as waiting for acceptors happens there
|
||||
cur.execute('CREATE TABLE t(key int primary key, value text)')
|
||||
cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
cur.execute('SELECT sum(key) FROM t')
|
||||
assert cur.fetchone() == (5000050000, )
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import subprocess
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import psycopg2.extras
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.utils import print_gc_result
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -13,27 +11,18 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Create a couple of branches off the main branch, at a historical point in time.
|
||||
#
|
||||
def test_branch_behind(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
def test_branch_behind(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
env.zenith_cli(["branch", "test_branch_behind", "empty"])
|
||||
zenith_cli.run(["branch", "test_branch_behind", "empty"])
|
||||
|
||||
pgmain = env.postgres.create_start('test_branch_behind')
|
||||
pgmain = postgres.create_start('test_branch_behind')
|
||||
log.info("postgres is running on 'test_branch_behind' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
main_cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = main_cur.fetchone()[0]
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)')
|
||||
|
||||
# keep some early lsn to test branch creation on out of date lsn
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
gced_lsn = main_cur.fetchone()[0]
|
||||
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
@@ -47,32 +36,32 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 200000) g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
log.info(f'LSN after 200100 rows: {lsn_b}')
|
||||
log.info(f'LSN after 100100 rows: {lsn_b}')
|
||||
|
||||
# Branch at the point where only 100 rows were inserted
|
||||
env.zenith_cli(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
zenith_cli.run(["branch", "test_branch_behind_hundred", "test_branch_behind@" + lsn_a])
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
FROM generate_series(1, 200000) g
|
||||
FROM generate_series(1, 100000) g
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
log.info(f'LSN after 400100 rows: {lsn_c}')
|
||||
log.info(f'LSN after 200100 rows: {lsn_c}')
|
||||
|
||||
# Branch at the point where only 200100 rows were inserted
|
||||
env.zenith_cli(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
# Branch at the point where only 200 rows were inserted
|
||||
zenith_cli.run(["branch", "test_branch_behind_more", "test_branch_behind@" + lsn_b])
|
||||
|
||||
pg_hundred = env.postgres.create_start("test_branch_behind_hundred")
|
||||
pg_more = env.postgres.create_start("test_branch_behind_more")
|
||||
pg_hundred = postgres.create_start("test_branch_behind_hundred")
|
||||
pg_more = postgres.create_start("test_branch_behind_more")
|
||||
|
||||
# On the 'hundred' branch, we should see only 100 rows
|
||||
hundred_pg_conn = pg_hundred.connect()
|
||||
@@ -84,43 +73,23 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
|
||||
more_pg_conn = pg_more.connect()
|
||||
more_cur = more_pg_conn.cursor()
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
assert more_cur.fetchone() == (100100, )
|
||||
|
||||
# All the rows are visible on the main branch
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
assert main_cur.fetchone() == (200100, )
|
||||
|
||||
# Check bad lsn's for branching
|
||||
|
||||
# branch at segment boundary
|
||||
env.zenith_cli(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
|
||||
pg = env.postgres.create_start("test_branch_segment_boundary")
|
||||
zenith_cli.run(["branch", "test_branch_segment_boundary", "test_branch_behind@0/3000000"])
|
||||
pg = postgres.create_start("test_branch_segment_boundary")
|
||||
cur = pg.connect().cursor()
|
||||
cur.execute('SELECT 1')
|
||||
assert cur.fetchone() == (1, )
|
||||
|
||||
# branch at pre-initdb lsn
|
||||
with pytest.raises(Exception, match="invalid branch start lsn"):
|
||||
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
|
||||
# check that we cannot create branch based on garbage collected data
|
||||
with closing(env.pageserver.connect()) as psconn:
|
||||
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
|
||||
# call gc to advace latest_gc_cutoff_lsn
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
|
||||
with pytest.raises(Exception, match="invalid branch start lsn"):
|
||||
# this gced_lsn is pretty random, so if gc is disabled this woudln't fail
|
||||
env.zenith_cli(["branch", "test_branch_create_fail", f"test_branch_behind@{gced_lsn}"])
|
||||
|
||||
# check that after gc everything is still there
|
||||
hundred_cur.execute('SELECT count(*) FROM foo')
|
||||
assert hundred_cur.fetchone() == (100, )
|
||||
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
try:
|
||||
zenith_cli.run(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
except subprocess.CalledProcessError:
|
||||
log.info("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -3,8 +3,11 @@ import os
|
||||
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -12,23 +15,19 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test compute node start after clog truncation
|
||||
#
|
||||
def test_clog_truncate(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
def test_clog_truncate(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_clog_truncate", "empty"])
|
||||
zenith_cli.run(["branch", "test_clog_truncate", "empty"])
|
||||
|
||||
# set agressive autovacuum to make sure that truncation will happen
|
||||
config = [
|
||||
'autovacuum_max_workers=10',
|
||||
'autovacuum_vacuum_threshold=0',
|
||||
'autovacuum_vacuum_insert_threshold=0',
|
||||
'autovacuum_vacuum_cost_delay=0',
|
||||
'autovacuum_vacuum_cost_limit=10000',
|
||||
'autovacuum_naptime =1s',
|
||||
'autovacuum_max_workers=10', 'autovacuum_vacuum_threshold=0',
|
||||
'autovacuum_vacuum_insert_threshold=0', 'autovacuum_vacuum_cost_delay=0',
|
||||
'autovacuum_vacuum_cost_limit=10000', 'autovacuum_naptime =1s',
|
||||
'autovacuum_freeze_max_age=100000'
|
||||
]
|
||||
|
||||
pg = env.postgres.create_start('test_clog_truncate', config_lines=config)
|
||||
pg = postgres.create_start('test_clog_truncate', config_lines=config)
|
||||
log.info('postgres is running on test_clog_truncate branch')
|
||||
|
||||
# Install extension containing function needed for test
|
||||
@@ -65,13 +64,13 @@ def test_clog_truncate(zenith_simple_env: ZenithEnv):
|
||||
|
||||
# create new branch after clog truncation and start a compute node on it
|
||||
log.info(f'create branch at lsn_after_truncation {lsn_after_truncation}')
|
||||
env.zenith_cli(
|
||||
zenith_cli.run(
|
||||
["branch", "test_clog_truncate_new", "test_clog_truncate@" + lsn_after_truncation])
|
||||
|
||||
pg2 = env.postgres.create_start('test_clog_truncate_new')
|
||||
pg2 = postgres.create_start('test_clog_truncate_new')
|
||||
log.info('postgres is running on test_clog_truncate_new branch')
|
||||
|
||||
# check that new node doesn't contain truncated segment
|
||||
pg_xact_0000_path_new = os.path.join(pg2.pg_xact_dir_path(), '0000')
|
||||
log.info(f"pg_xact_0000_path_new = {pg_xact_0000_path_new}")
|
||||
log.info("pg_xact_0000_path_new = " + pg_xact_0000_path_new)
|
||||
assert os.path.isfile(pg_xact_0000_path_new) is False
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -9,13 +12,12 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test starting Postgres with custom options
|
||||
#
|
||||
def test_config(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
def test_config(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_config", "empty"])
|
||||
zenith_cli.run(["branch", "test_config", "empty"])
|
||||
|
||||
# change config
|
||||
pg = env.postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
pg = postgres.create_start('test_config', config_lines=['log_min_messages=debug1'])
|
||||
log.info('postgres is running on test_config branch')
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
|
||||
@@ -2,8 +2,11 @@ import os
|
||||
import pathlib
|
||||
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, PostgresFactory, ZenithCli, check_restored_datadir_content
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -11,11 +14,15 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test CREATE DATABASE when there have been relmapper changes
|
||||
#
|
||||
def test_createdb(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
env.zenith_cli(["branch", "test_createdb", "empty"])
|
||||
def test_createdb(
|
||||
zenith_cli: ZenithCli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
):
|
||||
zenith_cli.run(["branch", "test_createdb", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_createdb')
|
||||
pg = postgres.create_start('test_createdb')
|
||||
log.info("postgres is running on 'test_createdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
@@ -29,23 +36,27 @@ def test_createdb(zenith_simple_env: ZenithEnv):
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
# Create a branch
|
||||
env.zenith_cli(["branch", "test_createdb2", "test_createdb@" + lsn])
|
||||
zenith_cli.run(["branch", "test_createdb2", "test_createdb@" + lsn])
|
||||
|
||||
pg2 = env.postgres.create_start('test_createdb2')
|
||||
pg2 = postgres.create_start('test_createdb2')
|
||||
|
||||
# Test that you can connect to the new database on both branches
|
||||
for db in (pg, pg2):
|
||||
db.connect(dbname='foodb').close()
|
||||
|
||||
|
||||
#
|
||||
# Test DROP DATABASE
|
||||
#
|
||||
def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
|
||||
env = zenith_simple_env
|
||||
env.zenith_cli(["branch", "test_dropdb", "empty"])
|
||||
def test_dropdb(
|
||||
zenith_cli: ZenithCli,
|
||||
pageserver: ZenithPageserver,
|
||||
postgres: PostgresFactory,
|
||||
pg_bin,
|
||||
test_output_dir
|
||||
):
|
||||
zenith_cli.run(["branch", "test_dropdb", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_dropdb')
|
||||
pg = postgres.create_start('test_dropdb')
|
||||
log.info("postgres is running on 'test_dropdb' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
@@ -58,6 +69,7 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
|
||||
cur.execute("SELECT oid FROM pg_database WHERE datname='foodb';")
|
||||
dboid = cur.fetchone()[0]
|
||||
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('DROP DATABASE foodb')
|
||||
@@ -67,29 +79,28 @@ def test_dropdb(zenith_simple_env: ZenithEnv, test_output_dir):
|
||||
cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_after_drop = cur.fetchone()[0]
|
||||
|
||||
# Create two branches before and after database drop.
|
||||
env.zenith_cli(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
|
||||
pg_before = env.postgres.create_start('test_before_dropdb')
|
||||
|
||||
env.zenith_cli(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
|
||||
pg_after = env.postgres.create_start('test_after_dropdb')
|
||||
# Create two branches before and after database drop.
|
||||
zenith_cli.run(["branch", "test_before_dropdb", "test_dropdb@" + lsn_before_drop])
|
||||
pg_before = postgres.create_start('test_before_dropdb')
|
||||
|
||||
zenith_cli.run(["branch", "test_after_dropdb", "test_dropdb@" + lsn_after_drop])
|
||||
pg_after = postgres.create_start('test_after_dropdb')
|
||||
|
||||
# Test that database exists on the branch before drop
|
||||
pg_before.connect(dbname='foodb').close()
|
||||
|
||||
# Test that database subdir exists on the branch before drop
|
||||
assert pg_before.pgdata_dir
|
||||
dbpath = pathlib.Path(pg_before.pgdata_dir) / 'base' / str(dboid)
|
||||
log.info(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == True
|
||||
|
||||
# Test that database subdir doesn't exist on the branch after drop
|
||||
assert pg_after.pgdata_dir
|
||||
dbpath = pathlib.Path(pg_after.pgdata_dir) / 'base' / str(dboid)
|
||||
log.info(dbpath)
|
||||
|
||||
assert os.path.isdir(dbpath) == False
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(test_output_dir, env, pg)
|
||||
check_restored_datadir_content(zenith_cli, test_output_dir, pg, pageserver.service_port.pg)
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -9,11 +12,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
#
|
||||
# Test CREATE USER to check shared catalog restore
|
||||
#
|
||||
def test_createuser(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
env.zenith_cli(["branch", "test_createuser", "empty"])
|
||||
def test_createuser(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
zenith_cli.run(["branch", "test_createuser", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_createuser')
|
||||
pg = postgres.create_start('test_createuser')
|
||||
log.info("postgres is running on 'test_createuser' branch")
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
@@ -27,9 +29,9 @@ def test_createuser(zenith_simple_env: ZenithEnv):
|
||||
lsn = cur.fetchone()[0]
|
||||
|
||||
# Create a branch
|
||||
env.zenith_cli(["branch", "test_createuser2", "test_createuser@" + lsn])
|
||||
zenith_cli.run(["branch", "test_createuser2", "test_createuser@" + lsn])
|
||||
|
||||
pg2 = env.postgres.create_start('test_createuser2')
|
||||
pg2 = postgres.create_start('test_createuser2')
|
||||
|
||||
# Test that you can connect to new branch as a new user
|
||||
assert pg2.safe_psql('select current_user', username='testuser') == [('testuser', )]
|
||||
|
||||
@@ -1,5 +1,8 @@
|
||||
from fixtures.zenith_fixtures import ZenithEnv, check_restored_datadir_content
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver, check_restored_datadir_content
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -10,11 +13,11 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
# it only checks next_multixact_id field in restored pg_control,
|
||||
# since we don't have functions to check multixact internals.
|
||||
#
|
||||
def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
|
||||
env = zenith_simple_env
|
||||
def test_multixact(pageserver: ZenithPageserver, postgres: PostgresFactory,
|
||||
pg_bin, zenith_cli, base_dir, test_output_dir):
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_multixact", "empty"])
|
||||
pg = env.postgres.create_start('test_multixact')
|
||||
zenith_cli.run(["branch", "test_multixact", "empty"])
|
||||
pg = postgres.create_start('test_multixact')
|
||||
|
||||
log.info("postgres is running on 'test_multixact' branch")
|
||||
pg_conn = pg.connect()
|
||||
@@ -53,8 +56,8 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
|
||||
assert int(next_multixact_id) > int(next_multixact_id_old)
|
||||
|
||||
# Branch at this point
|
||||
env.zenith_cli(["branch", "test_multixact_new", "test_multixact@" + lsn])
|
||||
pg_new = env.postgres.create_start('test_multixact_new')
|
||||
zenith_cli.run(["branch", "test_multixact_new", "test_multixact@" + lsn])
|
||||
pg_new = postgres.create_start('test_multixact_new')
|
||||
|
||||
log.info("postgres is running on 'test_multixact_new' branch")
|
||||
pg_new_conn = pg_new.connect()
|
||||
@@ -67,4 +70,4 @@ def test_multixact(zenith_simple_env: ZenithEnv, test_output_dir):
|
||||
assert next_multixact_id_new == next_multixact_id
|
||||
|
||||
# Check that we restore the content of the datadir correctly
|
||||
check_restored_datadir_content(test_output_dir, env, pg_new)
|
||||
check_restored_datadir_content(zenith_cli, test_output_dir, pg_new, pageserver.service_port.pg)
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import PostgresFactory, ZenithPageserver
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
#
|
||||
# Test where Postgres generates a lot of WAL, and it's garbage collected away, but
|
||||
# no pages are evicted so that Postgres uses an old LSN in a GetPage request.
|
||||
@@ -16,11 +18,10 @@ pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
# just a hint that the page hasn't been modified since that LSN, and the page
|
||||
# server should return the latest page version regardless of the LSN.
|
||||
#
|
||||
def test_old_request_lsn(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
def test_old_request_lsn(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, pg_bin):
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_old_request_lsn", "empty"])
|
||||
pg = env.postgres.create_start('test_old_request_lsn')
|
||||
zenith_cli.run(["branch", "test_old_request_lsn", "empty"])
|
||||
pg = postgres.create_start('test_old_request_lsn')
|
||||
log.info('postgres is running on test_old_request_lsn branch')
|
||||
|
||||
pg_conn = pg.connect()
|
||||
@@ -30,7 +31,7 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
psconn = env.pageserver.connect()
|
||||
psconn = pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Create table, and insert some rows. Make it big enough that it doesn't fit in
|
||||
@@ -49,20 +50,20 @@ def test_old_request_lsn(zenith_simple_env: ZenithEnv):
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
log.info(f'shared_buffers is {row[0]}, table size {row[1]}')
|
||||
log.info(f'shared_buffers is {row[0]}, table size {row[1]}');
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
cur.execute('VACUUM foo')
|
||||
cur.execute('VACUUM foo');
|
||||
|
||||
# Make a lot of updates on a single row, generating a lot of WAL. Trigger
|
||||
# garbage collections so that the page server will remove old page versions.
|
||||
for i in range(10):
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
|
||||
pscur.execute(f"do_gc {pageserver.initial_tenant} {timeline} 0")
|
||||
for j in range(100):
|
||||
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;')
|
||||
cur.execute('UPDATE foo SET val = val + 1 WHERE id = 1;');
|
||||
|
||||
# All (or at least most of) the updates should've been on the same page, so
|
||||
# that we haven't had to evict any dirty pages for a long time. Now run
|
||||
# a query that sends GetPage@LSN requests with the old LSN.
|
||||
cur.execute("SELECT COUNT(*), SUM(val) FROM foo")
|
||||
cur.execute("SELECT COUNT(*), SUM(val) FROM foo");
|
||||
assert cur.fetchone() == (100000, 101000)
|
||||
|
||||
@@ -3,28 +3,25 @@ from uuid import uuid4
|
||||
import pytest
|
||||
import psycopg2
|
||||
import requests
|
||||
from fixtures.zenith_fixtures import ZenithEnv, ZenithEnvBuilder, ZenithPageserverHttpClient
|
||||
from typing import cast
|
||||
from fixtures.zenith_fixtures import ZenithPageserver, ZenithPageserverHttpClient
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def test_status_psql(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
assert env.pageserver.safe_psql('status') == [
|
||||
def test_status_psql(pageserver):
|
||||
assert pageserver.safe_psql('status') == [
|
||||
('hello world', ),
|
||||
]
|
||||
|
||||
|
||||
def test_branch_list_psql(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
def test_branch_list_psql(pageserver: ZenithPageserver, zenith_cli):
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_branch_list_main", "empty"])
|
||||
zenith_cli.run(["branch", "test_branch_list_main", "empty"])
|
||||
|
||||
conn = env.pageserver.connect()
|
||||
conn = pageserver.connect()
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute(f'branch_list {env.initial_tenant}')
|
||||
cur.execute(f'branch_list {pageserver.initial_tenant}')
|
||||
branches = json.loads(cur.fetchone()[0])
|
||||
# Filter out branches created by other tests
|
||||
branches = [x for x in branches if x['name'].startswith('test_branch_list')]
|
||||
@@ -37,10 +34,10 @@ def test_branch_list_psql(zenith_simple_env: ZenithEnv):
|
||||
assert 'ancestor_lsn' in branches[0]
|
||||
|
||||
# Create another branch, and start Postgres on it
|
||||
env.zenith_cli(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
|
||||
env.zenith_cli(['pg', 'create', 'test_branch_list_experimental'])
|
||||
zenith_cli.run(['branch', 'test_branch_list_experimental', 'test_branch_list_main'])
|
||||
zenith_cli.run(['pg', 'create', 'test_branch_list_experimental'])
|
||||
|
||||
cur.execute(f'branch_list {env.initial_tenant}')
|
||||
cur.execute(f'branch_list {pageserver.initial_tenant}')
|
||||
new_branches = json.loads(cur.fetchone()[0])
|
||||
# Filter out branches created by other tests
|
||||
new_branches = [x for x in new_branches if x['name'].startswith('test_branch_list')]
|
||||
@@ -56,22 +53,18 @@ def test_branch_list_psql(zenith_simple_env: ZenithEnv):
|
||||
conn.close()
|
||||
|
||||
|
||||
def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
|
||||
# don't use zenith_simple_env, because there might be other tenants there,
|
||||
# left over from other tests.
|
||||
env = zenith_env_builder.init()
|
||||
|
||||
res = env.zenith_cli(["tenant", "list"])
|
||||
def test_tenant_list_psql(pageserver: ZenithPageserver, zenith_cli):
|
||||
res = zenith_cli.run(["tenant", "list"])
|
||||
res.check_returncode()
|
||||
tenants = sorted(map(lambda t: t.split()[0], res.stdout.splitlines()))
|
||||
assert tenants == [env.initial_tenant]
|
||||
tenants = res.stdout.splitlines()
|
||||
assert tenants == [pageserver.initial_tenant]
|
||||
|
||||
conn = env.pageserver.connect()
|
||||
conn = pageserver.connect()
|
||||
cur = conn.cursor()
|
||||
|
||||
# check same tenant cannot be created twice
|
||||
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {env.initial_tenant} already exists'):
|
||||
cur.execute(f'tenant_create {env.initial_tenant}')
|
||||
with pytest.raises(psycopg2.DatabaseError, match=f'tenant {pageserver.initial_tenant} already exists'):
|
||||
cur.execute(f'tenant_create {pageserver.initial_tenant}')
|
||||
|
||||
# create one more tenant
|
||||
tenant1 = uuid4().hex
|
||||
@@ -80,20 +73,20 @@ def test_tenant_list_psql(zenith_env_builder: ZenithEnvBuilder):
|
||||
cur.execute('tenant_list')
|
||||
|
||||
# compare tenants list
|
||||
new_tenants = sorted(map(lambda t: cast(str, t['id']), json.loads(cur.fetchone()[0])))
|
||||
assert sorted([env.initial_tenant, tenant1]) == new_tenants
|
||||
new_tenants = sorted(json.loads(cur.fetchone()[0]))
|
||||
assert sorted([pageserver.initial_tenant, tenant1]) == new_tenants
|
||||
|
||||
|
||||
def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
|
||||
client.check_status()
|
||||
|
||||
# check initial tenant is there
|
||||
assert initial_tenant in {t['id'] for t in client.tenant_list()}
|
||||
assert initial_tenant in set(client.tenant_list())
|
||||
|
||||
# create new tenant and check it is also there
|
||||
tenant_id = uuid4()
|
||||
client.tenant_create(tenant_id)
|
||||
assert tenant_id.hex in {t['id'] for t in client.tenant_list()}
|
||||
assert tenant_id.hex in set(client.tenant_list())
|
||||
|
||||
# create branch
|
||||
branch_name = uuid4().hex
|
||||
@@ -103,17 +96,11 @@ def check_client(client: ZenithPageserverHttpClient, initial_tenant: str):
|
||||
assert branch_name in {b['name'] for b in client.branch_list(tenant_id)}
|
||||
|
||||
|
||||
def test_pageserver_http_api_client(zenith_simple_env: ZenithEnv):
|
||||
env = zenith_simple_env
|
||||
client = env.pageserver.http_client()
|
||||
check_client(client, env.initial_tenant)
|
||||
def test_pageserver_http_api_client(pageserver: ZenithPageserver):
|
||||
client = pageserver.http_client()
|
||||
check_client(client, pageserver.initial_tenant)
|
||||
|
||||
|
||||
def test_pageserver_http_api_client_auth_enabled(zenith_env_builder: ZenithEnvBuilder):
|
||||
zenith_env_builder.pageserver_auth_enabled = True
|
||||
env = zenith_env_builder.init()
|
||||
|
||||
management_token = env.auth_keys.generate_management_token()
|
||||
|
||||
client = env.pageserver.http_client(auth_token=management_token)
|
||||
check_client(client, env.initial_tenant)
|
||||
def test_pageserver_http_api_client_auth_enabled(pageserver_auth_enabled: ZenithPageserver):
|
||||
client = pageserver_auth_enabled.http_client(auth_token=pageserver_auth_enabled.auth_keys.generate_management_token())
|
||||
check_client(client, pageserver_auth_enabled.initial_tenant)
|
||||
|
||||
@@ -4,22 +4,25 @@ import time
|
||||
|
||||
from contextlib import closing
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.zenith_fixtures import WalAcceptorFactory, ZenithPageserver, PostgresFactory
|
||||
|
||||
import logging
|
||||
import fixtures.log_helper # configures loggers
|
||||
log = logging.getLogger('root')
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||
# times, with fault_probability chance of getting a wal acceptor down or up
|
||||
# along the way. 2 of 3 are always alive, so the work keeps going.
|
||||
def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
|
||||
# One safekeeper is enough for this test.
|
||||
zenith_env_builder.num_safekeepers = 1
|
||||
env = zenith_env_builder.init()
|
||||
def test_pageserver_restart(zenith_cli, pageserver: ZenithPageserver, postgres: PostgresFactory, wa_factory: WalAcceptorFactory):
|
||||
|
||||
env.zenith_cli(["branch", "test_pageserver_restart", "main"])
|
||||
pg = env.postgres.create_start('test_pageserver_restart')
|
||||
# One safekeeper is enough for this test.
|
||||
wa_factory.start_n_new(1)
|
||||
|
||||
zenith_cli.run(["branch", "test_pageserver_restart", "empty"])
|
||||
pg = postgres.create_start('test_pageserver_restart',
|
||||
wal_acceptors=wa_factory.get_connstrs())
|
||||
|
||||
pg_conn = pg.connect()
|
||||
cur = pg_conn.cursor()
|
||||
@@ -41,14 +44,14 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}");
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
# Stop and restart pageserver. This is a more or less graceful shutdown, although
|
||||
# the page server doesn't currently have a shutdown routine so there's no difference
|
||||
# between stopping and crashing.
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
pageserver.stop();
|
||||
pageserver.start();
|
||||
|
||||
# Stopping the pageserver breaks the connection from the postgres backend to
|
||||
# the page server, and causes the next query on the connection to fail. Start a new
|
||||
@@ -62,5 +65,6 @@ def test_pageserver_restart(zenith_env_builder: ZenithEnvBuilder):
|
||||
assert cur.fetchone() == (100000, )
|
||||
|
||||
# Stop the page server by force, and restart it
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
pageserver.stop();
|
||||
pageserver.start();
|
||||
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
from io import BytesIO
|
||||
import asyncio
|
||||
import asyncpg
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import ZenithEnv, Postgres
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
async def repeat_bytes(buf, repetitions: int):
|
||||
for i in range(repetitions):
|
||||
yield buf
|
||||
|
||||
|
||||
async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str):
|
||||
buf = BytesIO()
|
||||
for i in range(1000):
|
||||
buf.write(
|
||||
f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode())
|
||||
buf.seek(0)
|
||||
|
||||
copy_input = repeat_bytes(buf.read(), 5000)
|
||||
|
||||
pg_conn = await pg.connect_async()
|
||||
await pg_conn.copy_to_table(table_name, source=copy_input)
|
||||
|
||||
|
||||
async def parallel_load_same_table(pg: Postgres, n_parallel: int):
|
||||
workers = []
|
||||
for worker_id in range(n_parallel):
|
||||
worker = copy_test_data_to_table(pg, worker_id, f'copytest')
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
# await all workers
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
|
||||
# Load data into one table with COPY TO from 5 parallel connections
|
||||
def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5):
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_parallel_copy", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_parallel_copy')
|
||||
log.info("postgres is running on 'test_parallel_copy' branch")
|
||||
|
||||
# Create test table
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute(f'CREATE TABLE copytest (i int, t text)')
|
||||
|
||||
# Run COPY TO to load the table with parallel connections.
|
||||
asyncio.run(parallel_load_same_table(pg, n_parallel))
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user