mirror of
https://github.com/neondatabase/neon.git
synced 2026-04-26 02:40:38 +00:00
Compare commits
1 Commits
parallel_w
...
layer-fsyn
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
09e29c3e54 |
@@ -4,16 +4,17 @@ executors:
|
||||
zenith-build-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
- image: cimg/rust:1.56.1
|
||||
- image: cimg/rust:1.55.0
|
||||
zenith-python-executor:
|
||||
docker:
|
||||
- image: cimg/python:3.7.10 # Oldest available 3.7 with Ubuntu 20.04 (for GLIBC and Rust) at CirlceCI
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
check-codestyle:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
- checkout
|
||||
|
||||
- run:
|
||||
name: rustfmt
|
||||
when: always
|
||||
@@ -80,8 +81,6 @@ jobs:
|
||||
build_type:
|
||||
type: enum
|
||||
enum: ["debug", "release"]
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
- run:
|
||||
name: apt install dependencies
|
||||
@@ -117,16 +116,15 @@ jobs:
|
||||
- run:
|
||||
name: Rust build << parameters.build_type >>
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
"${cov_prefix[@]}" cargo build $CARGO_FLAGS --bins --tests
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
echo "Build in debug mode"
|
||||
cargo build --bins --tests
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
echo "Build in release mode"
|
||||
cargo build --release --bins --tests
|
||||
fi
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
@@ -140,96 +138,67 @@ jobs:
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: cargo clippy
|
||||
name: clippy
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run:
|
||||
name: cargo test
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" cargo test
|
||||
- run: cargo test
|
||||
|
||||
# Install the rust binaries, for use by test jobs
|
||||
# `--locked` is required; otherwise, `cargo install` will ignore Cargo.lock.
|
||||
# FIXME: this is a really silly way to install; maybe we should just output
|
||||
# a tarball as an artifact? Or a .deb package?
|
||||
- run:
|
||||
name: Install rust binaries
|
||||
name: cargo install
|
||||
command: |
|
||||
export CARGO_INCREMENTAL=0
|
||||
BUILD_TYPE="<< parameters.build_type >>"
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
echo "Install debug mode"
|
||||
CARGO_FLAGS="--debug"
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
binaries=$(
|
||||
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
|
||||
test_exe_paths=$(
|
||||
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
|
||||
mkdir -p /tmp/zenith/bin
|
||||
mkdir -p /tmp/zenith/test_bin
|
||||
mkdir -p /tmp/zenith/etc
|
||||
|
||||
# Install target binaries
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/zenith/bin/$bin
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
|
||||
# Install test executables (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/zenith/test_bin/$(basename $bin)
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
echo "Install release mode"
|
||||
# The default is release mode; there is no --release flag.
|
||||
CARGO_FLAGS=""
|
||||
fi
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path pageserver
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path walkeeper
|
||||
cargo install $CARGO_FLAGS --locked --root /tmp/zenith --path zenith
|
||||
|
||||
# Install the postgres binaries, for use by test jobs
|
||||
# FIXME: this is a silly way to do "install"; maybe just output a standard
|
||||
# postgres package, whatever the favored form is (tarball? .deb package?)
|
||||
# Note that pg_regress needs some build artifacts that probably aren't
|
||||
# in the usual package...?
|
||||
- run:
|
||||
name: Install postgres binaries
|
||||
name: postgres install
|
||||
command: |
|
||||
cp -a tmp_install /tmp/zenith/pg_install
|
||||
|
||||
# Save the rust binaries and coverage data for other jobs in this workflow.
|
||||
# Save the rust output binaries for other jobs in this workflow.
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
check-codestyle-python:
|
||||
check-python:
|
||||
executor: zenith-python-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Install deps
|
||||
working_directory: test_runner
|
||||
command: pipenv --python 3.7 install --dev
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
when: always
|
||||
working_directory: test_runner
|
||||
command: pipenv run yapf --recursive --diff .
|
||||
- run:
|
||||
name: Run mypy to check types
|
||||
when: always
|
||||
working_directory: test_runner
|
||||
command: pipenv run mypy .
|
||||
|
||||
run-pytest:
|
||||
@@ -259,11 +228,6 @@ jobs:
|
||||
run_in_parallel:
|
||||
type: boolean
|
||||
default: true
|
||||
save_perf_report:
|
||||
type: boolean
|
||||
default: false
|
||||
environment:
|
||||
BUILD_TYPE: << parameters.build_type >>
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -274,25 +238,17 @@ jobs:
|
||||
- run: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Install deps
|
||||
working_directory: test_runner
|
||||
command: pipenv --python 3.7 install
|
||||
- run:
|
||||
name: Run pytest
|
||||
# pytest doesn't output test logs in real time, so CI job may fail with
|
||||
# `Too long with no output` error, if a test is running for a long time.
|
||||
# In that case, tests should have internal timeouts that are less than
|
||||
# no_output_timeout, specified here.
|
||||
no_output_timeout: 10m
|
||||
working_directory: test_runner
|
||||
environment:
|
||||
- ZENITH_BIN: /tmp/zenith/bin
|
||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||
- TEST_OUTPUT: /tmp/test_output
|
||||
# this variable will be embedded in perf test report
|
||||
# and is needed to distinguish different environments
|
||||
- PLATFORM: zenith-local-ci
|
||||
command: |
|
||||
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
|
||||
|
||||
TEST_SELECTION="test_runner/<< parameters.test_selection >>"
|
||||
TEST_SELECTION="<< parameters.test_selection >>"
|
||||
EXTRA_PARAMS="<< parameters.extra_params >>"
|
||||
if [ -z "$TEST_SELECTION" ]; then
|
||||
echo "test_selection must be set"
|
||||
@@ -300,22 +256,7 @@ jobs:
|
||||
fi
|
||||
if << parameters.run_in_parallel >>; then
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
fi
|
||||
if << parameters.save_perf_report >>; then
|
||||
if [[ $CIRCLE_BRANCH == "main" ]]; then
|
||||
mkdir -p "$PERF_REPORT_DIR"
|
||||
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
|
||||
fi
|
||||
fi
|
||||
|
||||
export GITHUB_SHA=$CIRCLE_SHA1
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
fi;
|
||||
# Run the tests.
|
||||
#
|
||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||
@@ -326,21 +267,7 @@ jobs:
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||
# in parallel and logs are mixed between different tests
|
||||
"${cov_prefix[@]}" pipenv run pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "not remote_cluster" \
|
||||
-rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
|
||||
if << parameters.save_perf_report >>; then
|
||||
if [[ $CIRCLE_BRANCH == "main" ]]; then
|
||||
# TODO: reuse scripts/git-upload
|
||||
export REPORT_FROM="$PERF_REPORT_DIR"
|
||||
export REPORT_TO=local
|
||||
scripts/generate_and_push_perf_report.sh
|
||||
fi
|
||||
fi
|
||||
pipenv run pytest --junitxml=$TEST_OUTPUT/junit.xml --tb=short --verbose -rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
- run:
|
||||
# CircleCI artifacts are preserved one file at a time, so skipping
|
||||
# this step isn't a good idea. If you want to extract the
|
||||
@@ -356,65 +283,6 @@ jobs:
|
||||
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
||||
- store_test_results:
|
||||
path: /tmp/test_output
|
||||
# Save coverage data (if any)
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: zenith-build-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
- checkout
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
keys:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
|
||||
- run:
|
||||
name: Install llvm-tools
|
||||
command: |
|
||||
# TODO: install a proper symbol demangler, e.g. rustfilt
|
||||
# TODO: we should embed this into a docker image
|
||||
rustup component add llvm-tools-preview
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
--input-objects=/tmp/zenith/etc/binaries.list \
|
||||
--commit-url=$COMMIT_URL \
|
||||
--format=github
|
||||
- run:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
# Add link to the coverage report to the commit
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"success\",
|
||||
\"context\": \"zenith-coverage\",
|
||||
\"description\": \"Coverage report is ready\",
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
@@ -431,7 +299,7 @@ jobs:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build --build-arg GIT_VERSION=$CIRCLE_SHA1 -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
docker build -t zenithdb/zenith:latest . && docker push zenithdb/zenith:latest
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
@@ -480,8 +348,8 @@ jobs:
|
||||
workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
- check-codestyle-rust
|
||||
- check-codestyle-python
|
||||
- check-codestyle
|
||||
- check-python
|
||||
- build-postgres:
|
||||
name: build-postgres-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -516,15 +384,8 @@ workflows:
|
||||
build_type: release
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-zenith-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
requires:
|
||||
# TODO: consider adding more
|
||||
- other-tests-debug
|
||||
- docker-image:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
|
||||
114
.github/workflows/benchmarking.yml
vendored
114
.github/workflows/benchmarking.yml
vendored
@@ -1,114 +0,0 @@
|
||||
name: benchmarking
|
||||
|
||||
on:
|
||||
# uncomment to run on push for debugging your PR
|
||||
# push:
|
||||
# branches: [ mybranch ]
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '36 7 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
env:
|
||||
BASE_URL: "https://console.zenith.tech"
|
||||
|
||||
jobs:
|
||||
bench:
|
||||
# this workflow runs on self hosteed runner
|
||||
# it's environment is quite different from usual guthub runner
|
||||
# probably the most important difference is that it doesnt start from clean workspace each time
|
||||
# e g if you install system packages they are not cleaned up since you install them directly in host machine
|
||||
# not a container or something
|
||||
# See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
PG_BIN: "/usr/pgsql-13/bin"
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# actions/setup-python@v2 is not working correctly on self-hosted runners
|
||||
# see https://github.com/actions/setup-python/issues/162
|
||||
# and probably https://github.com/actions/setup-python/issues/162#issuecomment-865387976 in particular
|
||||
# so the simplest solution to me is to use already installed system python and spin virtualenvs for job runs.
|
||||
# there is Python 3.7.10 already installed on the machine so use it to install pipenv and then use pipenv's virtuealenvs
|
||||
- name: Install pipenv & deps
|
||||
run: |
|
||||
python3 -m pip install --upgrade pipenv wheel
|
||||
# since pip/pipenv caches are reused there shouldn't be any troubles with install every time
|
||||
pipenv install
|
||||
|
||||
- name: Show versions
|
||||
run: |
|
||||
echo Python
|
||||
python3 --version
|
||||
pipenv run python3 --version
|
||||
echo Pipenv
|
||||
pipenv --version
|
||||
echo Pgbench
|
||||
$PG_BIN/pgbench --version
|
||||
|
||||
# FIXME cluster setup is skipped due to various changes in console API
|
||||
# for now pre created cluster is used. When API gain some stability
|
||||
# after massive changes dynamic cluster setup will be revived.
|
||||
# So use pre created cluster. It needs to be started manually, but stop is automatic after 5 minutes of inactivity
|
||||
- name: Setup cluster
|
||||
env:
|
||||
BENCHMARK_CONSOLE_USER_PASSWORD: "${{ secrets.BENCHMARK_CONSOLE_USER_PASSWORD }}"
|
||||
BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
|
||||
BENCHMARK_CLUSTER_ID: "${{ secrets.BENCHMARK_CLUSTER_ID }}"
|
||||
shell: bash
|
||||
run: |
|
||||
set -e
|
||||
|
||||
echo "Starting cluster"
|
||||
CLUSTER=$(curl -s --fail --show-error -X POST $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID/start \
|
||||
-H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
|
||||
echo $CLUSTER | python -m json.tool
|
||||
|
||||
echo "Waiting for cluster to become ready"
|
||||
sleep 10
|
||||
|
||||
echo "CLUSTER_ID=$BENCHMARK_CLUSTER_ID" >> $GITHUB_ENV
|
||||
CLUSTER=$(curl -s --fail --show-error -X GET $BASE_URL/api/v1/clusters/$BENCHMARK_CLUSTER_ID.json \
|
||||
-H "Authorization: Bearer $BENCHMARK_CONSOLE_ACCESS_TOKEN")
|
||||
echo $CLUSTER | python -m json.tool
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
# https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# via
|
||||
# sudo tee /etc/yum.repos.d/pgdg.repo<<EOF
|
||||
# [pgdg13]
|
||||
# name=PostgreSQL 13 for RHEL/CentOS 7 - x86_64
|
||||
# baseurl=https://download.postgresql.org/pub/repos/yum/13/redhat/rhel-7-x86_64/
|
||||
# enabled=1
|
||||
# gpgcheck=0
|
||||
# EOF
|
||||
# sudo yum makecache
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
env:
|
||||
TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,15"
|
||||
PLATFORM: "zenith-staging"
|
||||
BENCHMARK_CONSOLE_ACCESS_TOKEN: "${{ secrets.BENCHMARK_CONSOLE_ACCESS_TOKEN }}"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
run: |
|
||||
mkdir -p perf-report-staging
|
||||
pipenv run pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
|
||||
|
||||
- name: Submit result
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
run: |
|
||||
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -7,7 +7,3 @@ test_output/
|
||||
.vscode
|
||||
/.zenith
|
||||
/integration_tests/.zenith
|
||||
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
|
||||
|
||||
10
.yapfignore
10
.yapfignore
@@ -1,10 +0,0 @@
|
||||
# This file is only read when `yapf` is run from this directory.
|
||||
# Hence we only top-level directories here to avoid confusion.
|
||||
# See source code for the exact file format: https://github.com/google/yapf/blob/c6077954245bc3add82dafd853a1c7305a6ebd20/yapf/yapflib/file_resources.py#L40-L43
|
||||
vendor/
|
||||
target/
|
||||
tmp_install/
|
||||
__pycache__/
|
||||
test_output/
|
||||
.zenith/
|
||||
.git/
|
||||
33
Cargo.lock
generated
33
Cargo.lock
generated
@@ -208,7 +208,8 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "bookfile"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/zenithdb/bookfile.git?branch=generic-readext#d51a99c7a0be48c3d9cc7cb85c9b7fb05ce1100c"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "efa3e2086414e1bbecbc10730f265e5b079ab4ea0b830e7219a70dab6471e753"
|
||||
dependencies = [
|
||||
"aversion",
|
||||
"byteorder",
|
||||
@@ -653,28 +654,6 @@ dependencies = [
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "git-version"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6b0decc02f4636b9ccad390dcbe77b722a77efedfa393caf8379a51d5c61899"
|
||||
dependencies = [
|
||||
"git-version-macro",
|
||||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "git-version-macro"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fe69f1cbdb6e28af2bac214e943b99ce8a0a06b447d15d3e61161b0423139f3f"
|
||||
dependencies = [
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "glob"
|
||||
version = "0.3.0"
|
||||
@@ -1216,8 +1195,6 @@ dependencies = [
|
||||
"hyper",
|
||||
"lazy_static",
|
||||
"log",
|
||||
"nix",
|
||||
"once_cell",
|
||||
"postgres",
|
||||
"postgres-protocol",
|
||||
"postgres-types",
|
||||
@@ -1888,6 +1865,7 @@ version = "0.3.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c98891d737e271a2954825ef19e46bd16bdb98e2746f2eec4f7a4ef7946efd1"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"signal-hook-registry",
|
||||
]
|
||||
@@ -2359,8 +2337,6 @@ dependencies = [
|
||||
"rust-s3",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"walkdir",
|
||||
@@ -2597,13 +2573,11 @@ dependencies = [
|
||||
"bincode",
|
||||
"byteorder",
|
||||
"bytes",
|
||||
"git-version",
|
||||
"hex",
|
||||
"hex-literal",
|
||||
"hyper",
|
||||
"jsonwebtoken",
|
||||
"lazy_static",
|
||||
"nix",
|
||||
"postgres",
|
||||
"rand",
|
||||
"routerify",
|
||||
@@ -2611,7 +2585,6 @@ dependencies = [
|
||||
"rustls-split",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"signal-hook",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
|
||||
@@ -21,15 +21,11 @@ RUN rm -rf postgres_install/build
|
||||
# net time waste in a lot of cases. Copying Cargo.lock with empty lib.rs should do the work.
|
||||
#
|
||||
FROM zenithdb/build:buster AS build
|
||||
|
||||
ARG GIT_VERSION
|
||||
RUN if [ -z "$GIT_VERSION" ]; then echo "GIT_VERSION is reqired, use build_arg to pass it"; exit 1; fi
|
||||
|
||||
WORKDIR /zenith
|
||||
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
|
||||
COPY . .
|
||||
RUN GIT_VERSION=$GIT_VERSION cargo build --release
|
||||
RUN cargo build --release
|
||||
|
||||
#
|
||||
# Copy binaries to resulting image.
|
||||
|
||||
30
Pipfile
30
Pipfile
@@ -1,30 +0,0 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
pytest = ">=6.0.0"
|
||||
typing-extensions = "*"
|
||||
pyjwt = {extras = ["crypto"], version = "*"}
|
||||
requests = "*"
|
||||
pytest-xdist = "*"
|
||||
asyncpg = "*"
|
||||
cached-property = "*"
|
||||
psycopg2-binary = "*"
|
||||
jinja2 = "*"
|
||||
|
||||
[dev-packages]
|
||||
# Behavior may change slightly between versions. These are run continuously,
|
||||
# so we pin exact versions to avoid suprising breaks. Update if comfortable.
|
||||
yapf = "==0.31.0"
|
||||
mypy = "==0.910"
|
||||
# Non-pinned packages follow.
|
||||
pipenv = "*"
|
||||
flake8 = "*"
|
||||
types-requests = "*"
|
||||
types-psycopg2 = "*"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.7, but pipenv doesn't allow to say this directly
|
||||
python_version = "3"
|
||||
652
Pipfile.lock
generated
652
Pipfile.lock
generated
@@ -1,652 +0,0 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "c309cb963a7b07ae3d30e9cbf08b495f77bdecc0e5356fc89d133c4fbcb65b2b"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"cached-property": {
|
||||
"hashes": [
|
||||
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
|
||||
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.5.2"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
|
||||
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
|
||||
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
|
||||
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
|
||||
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
|
||||
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
|
||||
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
|
||||
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
|
||||
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
|
||||
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
|
||||
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
|
||||
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
|
||||
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
|
||||
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
|
||||
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
|
||||
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
|
||||
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
|
||||
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
|
||||
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
|
||||
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
|
||||
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
|
||||
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
|
||||
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
|
||||
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
|
||||
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
|
||||
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
|
||||
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
|
||||
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
|
||||
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
|
||||
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
|
||||
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
|
||||
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
|
||||
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
|
||||
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
|
||||
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
|
||||
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
|
||||
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
|
||||
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
|
||||
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
|
||||
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
|
||||
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
|
||||
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
|
||||
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
|
||||
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
|
||||
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
|
||||
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
|
||||
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
|
||||
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
|
||||
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
|
||||
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
|
||||
],
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
|
||||
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
|
||||
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
|
||||
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
|
||||
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
|
||||
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
|
||||
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
|
||||
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
|
||||
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
|
||||
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
|
||||
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
|
||||
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
|
||||
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
|
||||
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
|
||||
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
|
||||
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
|
||||
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
|
||||
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
|
||||
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
|
||||
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
|
||||
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
|
||||
],
|
||||
"version": "==35.0.0"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"jinja2": {
|
||||
"hashes": [
|
||||
"sha256:827a0e32839ab1600d4eb1c4c33ec5a8edfbc5cb42dafa13b81f182f97784b45",
|
||||
"sha256:8569982d3f0889eed11dd620c706d39b60c36d6d25843961f33f77fb6bc6b20c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.0.2"
|
||||
},
|
||||
"markupsafe": {
|
||||
"hashes": [
|
||||
"sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298",
|
||||
"sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64",
|
||||
"sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b",
|
||||
"sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194",
|
||||
"sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567",
|
||||
"sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff",
|
||||
"sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724",
|
||||
"sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74",
|
||||
"sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646",
|
||||
"sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35",
|
||||
"sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6",
|
||||
"sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a",
|
||||
"sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6",
|
||||
"sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad",
|
||||
"sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26",
|
||||
"sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38",
|
||||
"sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac",
|
||||
"sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7",
|
||||
"sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6",
|
||||
"sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047",
|
||||
"sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75",
|
||||
"sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f",
|
||||
"sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b",
|
||||
"sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135",
|
||||
"sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8",
|
||||
"sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a",
|
||||
"sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a",
|
||||
"sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1",
|
||||
"sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9",
|
||||
"sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864",
|
||||
"sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914",
|
||||
"sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee",
|
||||
"sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f",
|
||||
"sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18",
|
||||
"sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8",
|
||||
"sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2",
|
||||
"sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d",
|
||||
"sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b",
|
||||
"sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b",
|
||||
"sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86",
|
||||
"sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6",
|
||||
"sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f",
|
||||
"sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb",
|
||||
"sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833",
|
||||
"sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28",
|
||||
"sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e",
|
||||
"sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415",
|
||||
"sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902",
|
||||
"sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f",
|
||||
"sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d",
|
||||
"sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9",
|
||||
"sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d",
|
||||
"sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145",
|
||||
"sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066",
|
||||
"sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c",
|
||||
"sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1",
|
||||
"sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a",
|
||||
"sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207",
|
||||
"sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f",
|
||||
"sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53",
|
||||
"sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd",
|
||||
"sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134",
|
||||
"sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85",
|
||||
"sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9",
|
||||
"sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5",
|
||||
"sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94",
|
||||
"sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509",
|
||||
"sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51",
|
||||
"sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.0.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:096d689d78ca690e4cd8a89568ba06d07ca097e3306a4381635073ca91479966",
|
||||
"sha256:14317396d1e8cdb122989b916fa2c7e9ca8e2be9e8060a6eff75b6b7b4d8a7e0"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.2"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2-binary": {
|
||||
"hashes": [
|
||||
"sha256:0b7dae87f0b729922e06f85f667de7bf16455d411971b2043bbd9577af9d1975",
|
||||
"sha256:0f2e04bd2a2ab54fa44ee67fe2d002bb90cee1c0f1cc0ebc3148af7b02034cbd",
|
||||
"sha256:123c3fb684e9abfc47218d3784c7b4c47c8587951ea4dd5bc38b6636ac57f616",
|
||||
"sha256:1473c0215b0613dd938db54a653f68251a45a78b05f6fc21af4326f40e8360a2",
|
||||
"sha256:14db1752acdd2187d99cb2ca0a1a6dfe57fc65c3281e0f20e597aac8d2a5bd90",
|
||||
"sha256:1e3a362790edc0a365385b1ac4cc0acc429a0c0d662d829a50b6ce743ae61b5a",
|
||||
"sha256:1e85b74cbbb3056e3656f1cc4781294df03383127a8114cbc6531e8b8367bf1e",
|
||||
"sha256:20f1ab44d8c352074e2d7ca67dc00843067788791be373e67a0911998787ce7d",
|
||||
"sha256:24b0b6688b9f31a911f2361fe818492650795c9e5d3a1bc647acbd7440142a4f",
|
||||
"sha256:2f62c207d1740b0bde5c4e949f857b044818f734a3d57f1d0d0edc65050532ed",
|
||||
"sha256:3242b9619de955ab44581a03a64bdd7d5e470cc4183e8fcadd85ab9d3756ce7a",
|
||||
"sha256:35c4310f8febe41f442d3c65066ca93cccefd75013df3d8c736c5b93ec288140",
|
||||
"sha256:4235f9d5ddcab0b8dbd723dca56ea2922b485ea00e1dafacf33b0c7e840b3d32",
|
||||
"sha256:542875f62bc56e91c6eac05a0deadeae20e1730be4c6334d8f04c944fcd99759",
|
||||
"sha256:5ced67f1e34e1a450cdb48eb53ca73b60aa0af21c46b9b35ac3e581cf9f00e31",
|
||||
"sha256:661509f51531ec125e52357a489ea3806640d0ca37d9dada461ffc69ee1e7b6e",
|
||||
"sha256:7360647ea04db2e7dff1648d1da825c8cf68dc5fbd80b8fb5b3ee9f068dcd21a",
|
||||
"sha256:736b8797b58febabb85494142c627bd182b50d2a7ec65322983e71065ad3034c",
|
||||
"sha256:8c13d72ed6af7fd2c8acbd95661cf9477f94e381fce0792c04981a8283b52917",
|
||||
"sha256:988b47ac70d204aed01589ed342303da7c4d84b56c2f4c4b8b00deda123372bf",
|
||||
"sha256:995fc41ebda5a7a663a254a1dcac52638c3e847f48307b5416ee373da15075d7",
|
||||
"sha256:a36c7eb6152ba5467fb264d73844877be8b0847874d4822b7cf2d3c0cb8cdcb0",
|
||||
"sha256:aed4a9a7e3221b3e252c39d0bf794c438dc5453bc2963e8befe9d4cd324dff72",
|
||||
"sha256:aef9aee84ec78af51107181d02fe8773b100b01c5dfde351184ad9223eab3698",
|
||||
"sha256:b0221ca5a9837e040ebf61f48899926b5783668b7807419e4adae8175a31f773",
|
||||
"sha256:b4d7679a08fea64573c969f6994a2631908bb2c0e69a7235648642f3d2e39a68",
|
||||
"sha256:c250a7ec489b652c892e4f0a5d122cc14c3780f9f643e1a326754aedf82d9a76",
|
||||
"sha256:ca86db5b561b894f9e5f115d6a159fff2a2570a652e07889d8a383b5fae66eb4",
|
||||
"sha256:cfc523edecddaef56f6740d7de1ce24a2fdf94fd5e704091856a201872e37f9f",
|
||||
"sha256:d92272c7c16e105788efe2cfa5d680f07e34e0c29b03c1908f8636f55d5f915a",
|
||||
"sha256:da113b70f6ec40e7d81b43d1b139b9db6a05727ab8be1ee559f3a69854a69d34",
|
||||
"sha256:f6fac64a38f6768e7bc7b035b9e10d8a538a9fadce06b983fb3e6fa55ac5f5ce",
|
||||
"sha256:f8559617b1fcf59a9aedba2c9838b5b6aa211ffedecabca412b92a1ff75aac1a",
|
||||
"sha256:fbb42a541b1093385a2d8c7eec94d26d30437d0e77c1d25dae1dcc46741a385e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"pyjwt": {
|
||||
"extras": [
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
|
||||
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
|
||||
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.5"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
|
||||
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
|
||||
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
|
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.26.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
|
||||
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.7"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"backports.entry-points-selectable": {
|
||||
"hashes": [
|
||||
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
|
||||
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
|
||||
],
|
||||
"markers": "python_version >= '2.7'",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"distlib": {
|
||||
"hashes": [
|
||||
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
|
||||
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
|
||||
],
|
||||
"version": "==0.3.3"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:7afc856f74fa7006a289fd10fa840e1eebd8bbff6bffb69c26c54a0512ea8cf8",
|
||||
"sha256:bb2a1c717df74c48a2d00ed625e5a66f8572a3a30baacb7657add1d7bac4097b"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.3.2"
|
||||
},
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
|
||||
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.0.1"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
|
||||
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
|
||||
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
|
||||
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
|
||||
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
|
||||
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
|
||||
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
|
||||
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
|
||||
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
|
||||
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
|
||||
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
|
||||
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
|
||||
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
|
||||
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
|
||||
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
|
||||
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
|
||||
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
|
||||
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
|
||||
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
|
||||
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
|
||||
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
|
||||
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
|
||||
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.910"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pipenv": {
|
||||
"hashes": [
|
||||
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
|
||||
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2021.5.29"
|
||||
},
|
||||
"platformdirs": {
|
||||
"hashes": [
|
||||
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
|
||||
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
|
||||
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==2.8.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
|
||||
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typed-ast": {
|
||||
"hashes": [
|
||||
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
|
||||
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
|
||||
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
|
||||
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
|
||||
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
|
||||
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
|
||||
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
|
||||
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
|
||||
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
|
||||
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
|
||||
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
|
||||
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
|
||||
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
|
||||
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
|
||||
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
|
||||
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
|
||||
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
|
||||
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
|
||||
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
|
||||
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
|
||||
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
|
||||
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
|
||||
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
|
||||
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
|
||||
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
|
||||
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
|
||||
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
|
||||
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
|
||||
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
|
||||
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==1.4.3"
|
||||
},
|
||||
"types-psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
|
||||
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"types-requests": {
|
||||
"hashes": [
|
||||
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
|
||||
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.25.11"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"virtualenv": {
|
||||
"hashes": [
|
||||
"sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814",
|
||||
"sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==20.10.0"
|
||||
},
|
||||
"virtualenv-clone": {
|
||||
"hashes": [
|
||||
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
|
||||
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.5.7"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
1
Pipfile.lock
generated
Symbolic link
1
Pipfile.lock
generated
Symbolic link
@@ -0,0 +1 @@
|
||||
./test_runner/Pipfile.lock
|
||||
@@ -32,8 +32,8 @@ libssl-dev clang pkg-config libpq-dev
|
||||
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.7 or higher), and install python3 packages using `pipenv install` in the project directory.
|
||||
To run the integration tests (not required to use the code), install
|
||||
Python (3.7 or higher), and install python3 packages with `pipenv` using `pipenv install` in the project directory.
|
||||
|
||||
2. Build zenith and patched postgres
|
||||
```sh
|
||||
@@ -129,7 +129,7 @@ INSERT 0 1
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
cd test_runner
|
||||
pipenv run pytest
|
||||
pytest
|
||||
```
|
||||
|
||||
## Documentation
|
||||
|
||||
@@ -287,15 +287,10 @@ impl PostgresNode {
|
||||
conf.append("max_replication_slots", "10");
|
||||
conf.append("hot_standby", "on");
|
||||
conf.append("shared_buffers", "1MB");
|
||||
conf.append("max_wal_size", "100GB");
|
||||
conf.append("fsync", "off");
|
||||
conf.append("max_connections", "100");
|
||||
conf.append("wal_sender_timeout", "0");
|
||||
conf.append("wal_level", "replica");
|
||||
// wal_sender_timeout is the maximum time to wait for WAL replication.
|
||||
// It also defines how often the walreciever will send a feedback message to the wal sender.
|
||||
//conf.append("wal_sender_timeout", "5s");
|
||||
//conf.append("max_replication_flush_lag", "160MB");
|
||||
//conf.append("max_replication_apply_lag", "1500MB");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
|
||||
|
||||
@@ -130,8 +130,9 @@ impl SafekeeperNode {
|
||||
let listen_pg = format!("localhost:{}", self.conf.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.conf.http_port);
|
||||
|
||||
let mut cmd = Command::new(self.env.safekeeper_bin()?);
|
||||
cmd.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
let mut cmd: &mut Command = &mut Command::new(self.env.safekeeper_bin()?);
|
||||
cmd = cmd
|
||||
.args(&["-D", self.datadir_path().to_str().unwrap()])
|
||||
.args(&["--listen-pg", &listen_pg])
|
||||
.args(&["--listen-http", &listen_http])
|
||||
.args(&["--pageserver", &pageserver_conn])
|
||||
@@ -140,18 +141,13 @@ impl SafekeeperNode {
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
cmd = cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
if self.env.pageserver.auth_type == AuthType::ZenithJWT {
|
||||
cmd.env("PAGESERVER_AUTH_TOKEN", &self.env.pageserver.auth_token);
|
||||
}
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Safekeeper failed to start. See '{}' for details.",
|
||||
|
||||
@@ -5,7 +5,7 @@ use std::process::Command;
|
||||
use std::time::Duration;
|
||||
use std::{io, result, thread};
|
||||
|
||||
use anyhow::bail;
|
||||
use anyhow::{anyhow, bail};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
@@ -97,6 +97,7 @@ impl PageServerNode {
|
||||
}
|
||||
|
||||
pub fn init(&self, create_tenant: Option<&str>) -> anyhow::Result<()> {
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let listen_pg = format!("localhost:{}", self.env.pageserver.pg_port);
|
||||
let listen_http = format!("localhost:{}", self.env.pageserver.http_port);
|
||||
let mut args = vec![
|
||||
@@ -121,19 +122,18 @@ impl PageServerNode {
|
||||
args.extend(&["--create-tenant", tenantid])
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
cmd.args(args).env_clear().env("RUST_BACKTRACE", "1");
|
||||
let status = cmd
|
||||
.args(args)
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1")
|
||||
.status()
|
||||
.expect("pageserver init failed");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
if status.success() {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(anyhow!("pageserver init failed"))
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!("pageserver init failed");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn repo_path(&self) -> PathBuf {
|
||||
@@ -158,11 +158,6 @@ impl PageServerNode {
|
||||
.env_clear()
|
||||
.env("RUST_BACKTRACE", "1");
|
||||
|
||||
let var = "LLVM_PROFILE_FILE";
|
||||
if let Some(val) = std::env::var_os(var) {
|
||||
cmd.env(var, val);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
|
||||
@@ -51,14 +51,11 @@ Each PostgreSQL fork is considered a separate relish.
|
||||
|
||||
### Layer
|
||||
|
||||
A layer contains data needed to reconstruct any page versions within the
|
||||
layer's Segment and range of LSNs.
|
||||
|
||||
Each layer corresponds to the specific version of a relish Segment in a range of LSNs.
|
||||
There are two kinds of layers, in-memory and on-disk layers. In-memory
|
||||
layers are used to ingest incoming WAL, and provide fast access
|
||||
to the recent page versions. On-disk layers are stored as files on disk, and
|
||||
are immutable. See pageserver/src/layered_repository/README.md for more.
|
||||
|
||||
are immutable.
|
||||
### Layer file (on-disk layer)
|
||||
|
||||
Layered repository on-disk format is based on immutable files. The
|
||||
|
||||
128
docs/settings.md
128
docs/settings.md
@@ -1,128 +0,0 @@
|
||||
## Pageserver
|
||||
|
||||
### listen_pg_addr
|
||||
|
||||
Network interface and port number to listen at for connections from
|
||||
the compute nodes and safekeepers. The default is `127.0.0.1:64000`.
|
||||
|
||||
### listen_http_addr
|
||||
|
||||
Network interface and port number to listen at for admin connections.
|
||||
The default is `127.0.0.1:9898`.
|
||||
|
||||
### checkpoint_distance
|
||||
|
||||
`checkpoint_distance` is the amount of incoming WAL that is held in
|
||||
the open layer, before it's flushed to local disk. It puts an upper
|
||||
bound on how much WAL needs to be re-processed after a pageserver
|
||||
crash. It is a soft limit, the pageserver can momentarily go above it,
|
||||
but it will trigger a checkpoint operation to get it back below the
|
||||
limit.
|
||||
|
||||
`checkpoint_distance` also determines how much WAL needs to be kept
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
this much WAL, with some headroom, otherwise you can get stuck in a
|
||||
situation where the safekeeper is full and stops accepting new WAL,
|
||||
but the pageserver is not flushing out and releasing the space in the
|
||||
safekeeper because it hasn't reached checkpoint_distance yet.
|
||||
|
||||
`checkpoint_distance` also controls how often the WAL is uploaded to
|
||||
S3.
|
||||
|
||||
The unit is # of bytes.
|
||||
|
||||
### checkpoint_period
|
||||
|
||||
The pageserver checks whether `checkpoint_distance` has been reached
|
||||
every `checkpoint_period` seconds. Default is 1 s, which should be
|
||||
fine.
|
||||
|
||||
### gc_horizon
|
||||
|
||||
`gz_horizon` determines how much history is retained, to allow
|
||||
branching and read replicas at an older point in time. The unit is #
|
||||
of bytes of WAL. Page versions older than this are garbage collected
|
||||
away.
|
||||
|
||||
### gc_period
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
|
||||
### superuser
|
||||
|
||||
Name of the initial superuser role, passed to initdb when a new tenant
|
||||
is initialized. It doesn't affect anything after initialization. The
|
||||
default is Note: The default is 'zenith_admin', and the console
|
||||
depends on that, so if you change it, bad things will happen.
|
||||
|
||||
### page_cache_size
|
||||
|
||||
Size of the page cache, to hold materialized page versions. Unit is
|
||||
number of 8 kB blocks. The default is 8192, which means 64 MB.
|
||||
|
||||
### max_file_descriptors
|
||||
|
||||
Max number of file descriptors to hold open concurrently for accessing
|
||||
layer files. This should be kept well below the process/container/OS
|
||||
limit (see `ulimit -n`), as the pageserver also needs file descriptors
|
||||
for other files and for sockets for incoming connections.
|
||||
|
||||
### postgres-distrib
|
||||
|
||||
A directory with Postgres installation to use during pageserver activities.
|
||||
Inside that dir, a `bin/postgres` binary should be present.
|
||||
|
||||
The default distrib dir is `./tmp_install/`.
|
||||
|
||||
### workdir (-D)
|
||||
|
||||
A directory in the file system, where pageserver will store its files.
|
||||
The default is `./.zenith/`.
|
||||
|
||||
### Remote storage
|
||||
|
||||
There's a way to automatically backup and restore some of the pageserver's data from working dir to the remote storage.
|
||||
The backup system is disabled by default and can be enabled for either of the currently available storages:
|
||||
|
||||
#### Local FS storage
|
||||
|
||||
##### remote-storage-local-path
|
||||
|
||||
Pageserver can back up and restore some of its workdir contents to another directory.
|
||||
For that, only a path to that directory needs to be specified as a parameter.
|
||||
|
||||
#### S3 storage
|
||||
|
||||
Pageserver can back up and restore some of its workdir contents to S3.
|
||||
Full set of S3 credentials is needed for that as parameters:
|
||||
|
||||
##### remote-storage-s3-bucket
|
||||
|
||||
Name of the bucket to connect to, example: "some-sample-bucket".
|
||||
|
||||
##### remote-storage-region
|
||||
|
||||
Name of the region where the bucket is located at, example: "eu-north-1"
|
||||
|
||||
##### remote-storage-access-key
|
||||
|
||||
Access key to connect to the bucket ("login" part of the credentials), example: "AKIAIOSFODNN7EXAMPLE"
|
||||
|
||||
##### remote-storage-secret-access-key
|
||||
|
||||
Secret access key to connect to the bucket ("password" part of the credentials), example: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
|
||||
|
||||
#### General remote storage configuration
|
||||
|
||||
Pagesever allows only one remote storage configured concurrently and errors if parameters from multiple different remote configurations are used.
|
||||
No default values are used for the remote storage configuration parameters.
|
||||
|
||||
##### remote-storage-max-concurrent-sync
|
||||
|
||||
Max number of concurrent connections to open for uploading to or
|
||||
downloading from S3.
|
||||
The default value is 100.
|
||||
|
||||
## safekeeper
|
||||
|
||||
TODO
|
||||
@@ -79,61 +79,3 @@ Helpers for exposing Prometheus metrics from the server.
|
||||
`/zenith_utils`:
|
||||
|
||||
Helpers that are shared between other crates in this repository.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
|
||||
A single virtual environment with all dependencies is described in the single `Pipfile`.
|
||||
|
||||
### Prerequisites
|
||||
- Install Python 3.7 (the minimal supported version)
|
||||
- Later version (e.g. 3.8) is ok if you don't write Python code
|
||||
- You can install Python 3.7 separately, e.g.:
|
||||
```bash
|
||||
# In Ubuntu
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa
|
||||
sudo apt update
|
||||
sudo apt install python3.7
|
||||
```
|
||||
- Install `pipenv`
|
||||
- Exact version of `pipenv` is not important, you can use Debian/Ubuntu package `pipenv`.
|
||||
- Install dependencies via either
|
||||
* `pipenv --python 3.7 install --dev` if you will write Python code, or
|
||||
* `pipenv install` if you only want to run Python scripts and don't have Python 3.7.
|
||||
|
||||
Run `pipenv shell` to activate the virtual environment.
|
||||
Alternatively, use `pipenv run` to run a single command in the venv, e.g. `pipenv run pytest`.
|
||||
|
||||
### Obligatory checks
|
||||
We force code formatting via `yapf` and type hints via `mypy`.
|
||||
Run the following commands in the repository's root (next to `setup.cfg`):
|
||||
|
||||
```bash
|
||||
pipenv run yapf -ri . # All code is reformatted
|
||||
pipenv run mypy . # Ensure there are no typing errors
|
||||
```
|
||||
|
||||
**WARNING**: do not run `mypy` from a directory other than the root of the repository.
|
||||
Otherwise it will not find its configuration.
|
||||
|
||||
Also consider:
|
||||
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Adding more type hints to your code to avoid `Any`.
|
||||
|
||||
### Changing dependencies
|
||||
You have to update `Pipfile.lock` if you have changed `Pipfile`:
|
||||
|
||||
```bash
|
||||
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
|
||||
pipenv run pipenv --version # Should be at least 2021.5.29
|
||||
pipenv run pipenv lock # Regenerate Pipfile.lock
|
||||
```
|
||||
|
||||
As the minimal supported version is Python 3.7 and we use it in CI,
|
||||
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
|
||||
Otherwise some back-compatibility packages will be missing.
|
||||
|
||||
It is also important to run recent `pipenv`.
|
||||
Older versions remove markers from `Pipfile.lock`.
|
||||
|
||||
@@ -5,7 +5,7 @@ authors = ["Stas Kelvich <stas@zenith.tech>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
bookfile = { git = "https://github.com/zenithdb/bookfile.git", branch="generic-readext" }
|
||||
bookfile = "^0.3"
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
regex = "1.4.5"
|
||||
@@ -17,7 +17,7 @@ lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "2.33.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.11", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio = { version = "1.11", features = ["process", "macros", "fs", "rt", "io-util"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
@@ -32,15 +32,12 @@ serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
toml = "0.5"
|
||||
scopeguard = "1.1.0"
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
async-trait = "0.1"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
signal-hook = "0.3.10"
|
||||
signal-hook = {version = "0.3.10", features = ["extended-siginfo"] }
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
once_cell = "1.8.0"
|
||||
|
||||
rust-s3 = { version = "0.27.0-rc4", features = ["no-verify-ssl"] }
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
|
||||
@@ -129,29 +129,29 @@ There are the following implementations present:
|
||||
* local filesystem — to use in tests mainly
|
||||
* AWS S3 - to use in production
|
||||
|
||||
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs.
|
||||
Implementation details are covered in the [storage readme](./src/relish_storage/README.md) and corresponding Rust file docs.
|
||||
|
||||
The backup service is disabled by default and can be enabled to interact with a single remote storage.
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} --remote-storage-local-path="/some/local/path/"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} --remote-storage-s3-bucket="some-sample-bucket" --remote-storage-region="eu-north-1" --remote-storage-access-key="SOMEKEYAAAAASADSAH*#" --remote-storage-secret-access-key="SOMEsEcReTsd292v"`
|
||||
* Local FS: `${PAGESERVER_BIN} --relish-storage-local-path="/some/local/path/"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} --relish-storage-s3-bucket="some-sample-bucket" --relish-storage-region="eu-north-1" --relish-storage-access-key="SOMEKEYAAAAASADSAH*#" --relish-storage-secret-access-key="SOMEsEcReTsd292v"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
|
||||
Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup backup targets.
|
||||
Required sections are:
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
[relish_storage]
|
||||
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
[relish_storage]
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
@@ -242,12 +242,10 @@ impl<'a> Basebackup<'a> {
|
||||
fn add_pgcontrol_file(&mut self) -> anyhow::Result<()> {
|
||||
let checkpoint_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)
|
||||
.context("failed to get checkpoint bytes")?;
|
||||
let pg_control_bytes = self
|
||||
.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)
|
||||
.context("failed get control bytes")?;
|
||||
.get_page_at_lsn(RelishTag::Checkpoint, 0, self.lsn)?;
|
||||
let pg_control_bytes =
|
||||
self.timeline
|
||||
.get_page_at_lsn(RelishTag::ControlFile, 0, self.lsn)?;
|
||||
let mut pg_control = ControlFileData::decode(&pg_control_bytes)?;
|
||||
let mut checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
|
||||
|
||||
@@ -287,7 +285,11 @@ impl<'a> Basebackup<'a> {
|
||||
|
||||
//send wal segment
|
||||
let segno = self.lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_name = XLogFileName(
|
||||
1, // FIXME: always use Postgres timeline 1
|
||||
segno,
|
||||
pg_constants::WAL_SEGMENT_SIZE,
|
||||
);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
|
||||
@@ -4,14 +4,11 @@
|
||||
use anyhow::Result;
|
||||
use clap::{App, Arg};
|
||||
use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
.about("Dump contents of one layer file, for debugging")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("path")
|
||||
.help("Path to file to dump")
|
||||
@@ -22,9 +19,6 @@ fn main() -> Result<()> {
|
||||
|
||||
let path = PathBuf::from(arg_matches.value_of("path").unwrap());
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(10);
|
||||
|
||||
dump_layerfile_from_path(&path)?;
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -5,27 +5,33 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::{
|
||||
env,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
net::TcpListener,
|
||||
path::{Path, PathBuf},
|
||||
str::FromStr,
|
||||
thread,
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType, tcp_listener, GIT_VERSION};
|
||||
use zenith_utils::{auth::JwtAuth, logging, postgres_backend::AuthType};
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use signal_hook::consts::signal::*;
|
||||
use signal_hook::consts::TERM_SIGNALS;
|
||||
use signal_hook::flag;
|
||||
use signal_hook::iterator::exfiltrator::WithOrigin;
|
||||
use signal_hook::iterator::SignalsInfo;
|
||||
use std::process::exit;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
use std::sync::Arc;
|
||||
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use pageserver::{
|
||||
branches, defaults::*, http, page_cache, page_service, remote_storage, tenant_mgr,
|
||||
virtual_file, PageServerConf, RemoteStorageConfig, RemoteStorageKind, S3Config, LOG_FILE_NAME,
|
||||
branches, defaults::*, http, page_service, relish_storage, tenant_mgr, PageServerConf,
|
||||
RelishStorageConfig, RelishStorageKind, S3Config, LOG_FILE_NAME,
|
||||
};
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::postgres_backend;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals::{self, Signal};
|
||||
|
||||
use const_format::formatcp;
|
||||
|
||||
@@ -38,29 +44,25 @@ struct CfgFileParams {
|
||||
checkpoint_period: Option<String>,
|
||||
gc_horizon: Option<String>,
|
||||
gc_period: Option<String>,
|
||||
open_mem_limit: Option<String>,
|
||||
page_cache_size: Option<String>,
|
||||
max_file_descriptors: Option<String>,
|
||||
pg_distrib_dir: Option<String>,
|
||||
auth_validation_public_key_path: Option<String>,
|
||||
auth_type: Option<String>,
|
||||
remote_storage_max_concurrent_sync: Option<String>,
|
||||
remote_storage_max_sync_errors: Option<String>,
|
||||
relish_storage_max_concurrent_sync: Option<String>,
|
||||
/////////////////////////////////
|
||||
//// Don't put `Option<String>` and other "simple" values below.
|
||||
////
|
||||
/// `Option<RemoteStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
|
||||
/// `Option<RelishStorage>` is a <a href='https://toml.io/en/v1.0.0#table'>table</a> in TOML.
|
||||
/// Values in TOML cannot be defined after tables (other tables can),
|
||||
/// and [`toml`] crate serializes all fields in the order of their appearance.
|
||||
////////////////////////////////
|
||||
remote_storage: Option<RemoteStorage>,
|
||||
relish_storage: Option<RelishStorage>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
|
||||
// Without this attribute, enums with values won't be serialized by the `toml` library (but can be deserialized nonetheless!).
|
||||
// See https://github.com/alexcrichton/toml-rs/blob/6c162e6562c3e432bf04c82a3d1d789d80761a86/examples/enum_external.rs for the examples
|
||||
#[serde(untagged)]
|
||||
enum RemoteStorage {
|
||||
enum RelishStorage {
|
||||
Local {
|
||||
local_path: String,
|
||||
},
|
||||
@@ -81,37 +83,33 @@ impl CfgFileParams {
|
||||
arg_matches.value_of(arg_name).map(str::to_owned)
|
||||
};
|
||||
|
||||
let remote_storage = if let Some(local_path) = get_arg("remote-storage-local-path") {
|
||||
Some(RemoteStorage::Local { local_path })
|
||||
let relish_storage = if let Some(local_path) = get_arg("relish-storage-local-path") {
|
||||
Some(RelishStorage::Local { local_path })
|
||||
} else if let Some((bucket_name, bucket_region)) =
|
||||
get_arg("remote-storage-s3-bucket").zip(get_arg("remote-storage-region"))
|
||||
get_arg("relish-storage-s3-bucket").zip(get_arg("relish-storage-region"))
|
||||
{
|
||||
Some(RemoteStorage::AwsS3 {
|
||||
Some(RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id: get_arg("remote-storage-access-key"),
|
||||
secret_access_key: get_arg("remote-storage-secret-access-key"),
|
||||
access_key_id: get_arg("relish-storage-access-key"),
|
||||
secret_access_key: get_arg("relish-storage-secret-access-key"),
|
||||
})
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Self {
|
||||
listen_pg_addr: get_arg("listen_pg_addr"),
|
||||
listen_http_addr: get_arg("listen_http_addr"),
|
||||
listen_pg_addr: get_arg("listen-pg"),
|
||||
listen_http_addr: get_arg("listen-http"),
|
||||
checkpoint_distance: get_arg("checkpoint_distance"),
|
||||
checkpoint_period: get_arg("checkpoint_period"),
|
||||
gc_horizon: get_arg("gc_horizon"),
|
||||
gc_period: get_arg("gc_period"),
|
||||
open_mem_limit: get_arg("open_mem_limit"),
|
||||
page_cache_size: get_arg("page_cache_size"),
|
||||
max_file_descriptors: get_arg("max_file_descriptors"),
|
||||
pg_distrib_dir: get_arg("postgres-distrib"),
|
||||
auth_validation_public_key_path: get_arg("auth-validation-public-key-path"),
|
||||
auth_type: get_arg("auth-type"),
|
||||
remote_storage,
|
||||
remote_storage_max_concurrent_sync: get_arg("remote-storage-max-concurrent-sync"),
|
||||
remote_storage_max_sync_errors: get_arg("remote-storage-max-sync-errors"),
|
||||
relish_storage,
|
||||
relish_storage_max_concurrent_sync: get_arg("relish-storage-max-concurrent-sync"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,21 +123,15 @@ impl CfgFileParams {
|
||||
checkpoint_period: self.checkpoint_period.or(other.checkpoint_period),
|
||||
gc_horizon: self.gc_horizon.or(other.gc_horizon),
|
||||
gc_period: self.gc_period.or(other.gc_period),
|
||||
open_mem_limit: self.open_mem_limit.or(other.open_mem_limit),
|
||||
page_cache_size: self.page_cache_size.or(other.page_cache_size),
|
||||
max_file_descriptors: self.max_file_descriptors.or(other.max_file_descriptors),
|
||||
pg_distrib_dir: self.pg_distrib_dir.or(other.pg_distrib_dir),
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.or(other.auth_validation_public_key_path),
|
||||
auth_type: self.auth_type.or(other.auth_type),
|
||||
remote_storage: self.remote_storage.or(other.remote_storage),
|
||||
remote_storage_max_concurrent_sync: self
|
||||
.remote_storage_max_concurrent_sync
|
||||
.or(other.remote_storage_max_concurrent_sync),
|
||||
remote_storage_max_sync_errors: self
|
||||
.remote_storage_max_sync_errors
|
||||
.or(other.remote_storage_max_sync_errors),
|
||||
relish_storage: self.relish_storage.or(other.relish_storage),
|
||||
relish_storage_max_concurrent_sync: self
|
||||
.relish_storage_max_concurrent_sync
|
||||
.or(other.relish_storage_max_concurrent_sync),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -175,21 +167,6 @@ impl CfgFileParams {
|
||||
None => DEFAULT_GC_PERIOD,
|
||||
};
|
||||
|
||||
let open_mem_limit: usize = match self.open_mem_limit.as_ref() {
|
||||
Some(open_mem_limit_str) => open_mem_limit_str.parse()?,
|
||||
None => DEFAULT_OPEN_MEM_LIMIT,
|
||||
};
|
||||
|
||||
let page_cache_size: usize = match self.page_cache_size.as_ref() {
|
||||
Some(page_cache_size_str) => page_cache_size_str.parse()?,
|
||||
None => DEFAULT_PAGE_CACHE_SIZE,
|
||||
};
|
||||
|
||||
let max_file_descriptors: usize = match self.max_file_descriptors.as_ref() {
|
||||
Some(max_file_descriptors_str) => max_file_descriptors_str.parse()?,
|
||||
None => DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
};
|
||||
|
||||
let pg_distrib_dir = match self.pg_distrib_dir.as_ref() {
|
||||
Some(pg_distrib_dir_str) => PathBuf::from(pg_distrib_dir_str),
|
||||
None => env::current_dir()?.join("tmp_install"),
|
||||
@@ -223,34 +200,31 @@ impl CfgFileParams {
|
||||
);
|
||||
}
|
||||
|
||||
let max_concurrent_sync = match self.remote_storage_max_concurrent_sync.as_deref() {
|
||||
Some(number_str) => number_str.parse()?,
|
||||
None => NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap(),
|
||||
let max_concurrent_sync = match self.relish_storage_max_concurrent_sync.as_deref() {
|
||||
Some(relish_storage_max_concurrent_sync) => {
|
||||
relish_storage_max_concurrent_sync.parse()?
|
||||
}
|
||||
None => DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS,
|
||||
};
|
||||
let max_sync_errors = match self.remote_storage_max_sync_errors.as_deref() {
|
||||
Some(number_str) => number_str.parse()?,
|
||||
None => NonZeroU32::new(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap(),
|
||||
};
|
||||
let remote_storage_config = self.remote_storage.as_ref().map(|storage_params| {
|
||||
let relish_storage_config = self.relish_storage.as_ref().map(|storage_params| {
|
||||
let storage = match storage_params.clone() {
|
||||
RemoteStorage::Local { local_path } => {
|
||||
RemoteStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
RelishStorage::Local { local_path } => {
|
||||
RelishStorageKind::LocalFs(PathBuf::from(local_path))
|
||||
}
|
||||
RemoteStorage::AwsS3 {
|
||||
RelishStorage::AwsS3 {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
} => RemoteStorageKind::AwsS3(S3Config {
|
||||
} => RelishStorageKind::AwsS3(S3Config {
|
||||
bucket_name,
|
||||
bucket_region,
|
||||
access_key_id,
|
||||
secret_access_key,
|
||||
}),
|
||||
};
|
||||
RemoteStorageConfig {
|
||||
RelishStorageConfig {
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
}
|
||||
});
|
||||
@@ -264,9 +238,6 @@ impl CfgFileParams {
|
||||
checkpoint_period,
|
||||
gc_horizon,
|
||||
gc_period,
|
||||
open_mem_limit,
|
||||
page_cache_size,
|
||||
max_file_descriptors,
|
||||
|
||||
superuser: String::from(DEFAULT_SUPERUSER),
|
||||
|
||||
@@ -276,7 +247,7 @@ impl CfgFileParams {
|
||||
|
||||
auth_validation_public_key_path,
|
||||
auth_type,
|
||||
remote_storage_config,
|
||||
relish_storage_config,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -285,19 +256,18 @@ fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("listen_pg_addr")
|
||||
Arg::with_name("listen-pg")
|
||||
.short("l")
|
||||
.long("listen_pg_addr")
|
||||
.aliases(&["listen", "listen-pg"]) // keep some compatibility
|
||||
.long("listen-pg")
|
||||
.alias("listen") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("listen for incoming page requests on ip:port (default: {DEFAULT_PG_LISTEN_ADDR})")),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("listen_http_addr")
|
||||
.long("listen_http_addr")
|
||||
.aliases(&["http_endpoint", "listen-http"]) // keep some compatibility
|
||||
Arg::with_name("listen-http")
|
||||
.long("listen-http")
|
||||
.alias("http_endpoint") // keep some compatibility
|
||||
.takes_value(true)
|
||||
.help(formatcp!("http endpoint address for metrics and management API calls on ip:port (default: {DEFAULT_HTTP_LISTEN_ADDR})")),
|
||||
)
|
||||
@@ -338,25 +308,6 @@ fn main() -> Result<()> {
|
||||
.takes_value(true)
|
||||
.help("Interval between garbage collector iterations"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("open_mem_limit")
|
||||
.long("open_mem_limit")
|
||||
.takes_value(true)
|
||||
.help("Amount of memory reserved for buffering incoming WAL"),
|
||||
)
|
||||
.arg(
|
||||
|
||||
Arg::with_name("page_cache_size")
|
||||
.long("page_cache_size")
|
||||
.takes_value(true)
|
||||
.help("Number of pages in the page cache"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("max_file_descriptors")
|
||||
.long("max_file_descriptors")
|
||||
.takes_value(true)
|
||||
.help("Max number of file descriptors to keep open for files"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("workdir")
|
||||
.short("D")
|
||||
@@ -390,45 +341,45 @@ fn main() -> Result<()> {
|
||||
.help("Authentication scheme type. One of: Trust, MD5, ZenithJWT"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-local-path")
|
||||
.long("remote-storage-local-path")
|
||||
Arg::with_name("relish-storage-local-path")
|
||||
.long("relish-storage-local-path")
|
||||
.takes_value(true)
|
||||
.help("Path to the local directory, to be used as an external remote storage")
|
||||
.help("Path to the local directory, to be used as an external relish storage")
|
||||
.conflicts_with_all(&[
|
||||
"remote-storage-s3-bucket",
|
||||
"remote-storage-region",
|
||||
"remote-storage-access-key",
|
||||
"remote-storage-secret-access-key",
|
||||
"relish-storage-s3-bucket",
|
||||
"relish-storage-region",
|
||||
"relish-storage-access-key",
|
||||
"relish-storage-secret-access-key",
|
||||
]),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-s3-bucket")
|
||||
.long("remote-storage-s3-bucket")
|
||||
Arg::with_name("relish-storage-s3-bucket")
|
||||
.long("relish-storage-s3-bucket")
|
||||
.takes_value(true)
|
||||
.help("Name of the AWS S3 bucket to use an external remote storage")
|
||||
.requires("remote-storage-region"),
|
||||
.help("Name of the AWS S3 bucket to use an external relish storage")
|
||||
.requires("relish-storage-region"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-region")
|
||||
.long("remote-storage-region")
|
||||
Arg::with_name("relish-storage-region")
|
||||
.long("relish-storage-region")
|
||||
.takes_value(true)
|
||||
.help("Region of the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-access-key")
|
||||
.long("remote-storage-access-key")
|
||||
Arg::with_name("relish-storage-access-key")
|
||||
.long("relish-storage-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-secret-access-key")
|
||||
.long("remote-storage-secret-access-key")
|
||||
Arg::with_name("relish-storage-secret-access-key")
|
||||
.long("relish-storage-secret-access-key")
|
||||
.takes_value(true)
|
||||
.help("Credentials to access the AWS S3 bucket"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("remote-storage-max-concurrent-sync")
|
||||
.long("remote-storage-max-concurrent-sync")
|
||||
Arg::with_name("relish-storage-max-concurrent-sync")
|
||||
.long("relish-storage-max-concurrent-sync")
|
||||
.takes_value(true)
|
||||
.help("Maximum allowed concurrent synchronisations with storage"),
|
||||
)
|
||||
@@ -488,11 +439,6 @@ fn main() -> Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
|
||||
page_cache::init(conf);
|
||||
|
||||
// Create repo and exit if init was requested
|
||||
if init {
|
||||
branches::init_pageserver(conf, create_tenant).context("Failed to init pageserver")?;
|
||||
@@ -516,7 +462,16 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, conf.daemonize)?;
|
||||
|
||||
info!("version: {}", GIT_VERSION);
|
||||
let term_now = Arc::new(AtomicBool::new(false));
|
||||
for sig in TERM_SIGNALS {
|
||||
// When terminated by a second term signal, exit with exit code 1.
|
||||
// This will do nothing the first time (because term_now is false).
|
||||
flag::register_conditional_shutdown(*sig, 1, Arc::clone(&term_now))?;
|
||||
// But this will "arm" the above for the second time, by setting it to true.
|
||||
// The order of registering these is important, if you put this one first, it will
|
||||
// first arm and then terminate ‒ all in the first round.
|
||||
flag::register(*sig, Arc::clone(&term_now))?;
|
||||
}
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -525,15 +480,14 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
"Starting pageserver http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone())?;
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone())?;
|
||||
|
||||
info!(
|
||||
"Starting pageserver pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pageserver_listener = tcp_listener::bind(conf.listen_pg_addr.clone())?;
|
||||
let pageserver_listener = TcpListener::bind(conf.listen_pg_addr.clone())?;
|
||||
|
||||
// XXX: Don't spawn any threads before daemonizing!
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
@@ -548,21 +502,18 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
// keep join handles for spawned threads
|
||||
// don't spawn threads before daemonizing
|
||||
let mut join_handles = Vec::new();
|
||||
|
||||
if let Some(handle) = remote_storage::run_storage_sync_thread(conf)? {
|
||||
threads.push(handle);
|
||||
if let Some(handle) = relish_storage::run_storage_sync_thread(conf)? {
|
||||
join_handles.push(handle);
|
||||
}
|
||||
// Initialize tenant manager.
|
||||
tenant_mgr::init(conf);
|
||||
@@ -581,55 +532,61 @@ fn start_pageserver(conf: &'static PageServerConf) -> Result<()> {
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let cloned = auth.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?,
|
||||
);
|
||||
let http_endpoint_thread = thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(move || {
|
||||
let router = http::make_router(conf, cloned);
|
||||
endpoint::serve_thread_main(router, http_listener)
|
||||
})?;
|
||||
|
||||
join_handles.push(http_endpoint_thread);
|
||||
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
// for each connection.
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?,
|
||||
);
|
||||
let page_service_thread = thread::Builder::new()
|
||||
.name("Page Service thread".into())
|
||||
.spawn(move || {
|
||||
page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type)
|
||||
})?;
|
||||
|
||||
signals.handle(|signal| match signal {
|
||||
Signal::Quit => {
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(111);
|
||||
}
|
||||
|
||||
Signal::Interrupt | Signal::Terminate => {
|
||||
info!(
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
endpoint::shutdown();
|
||||
|
||||
for handle in std::mem::take(&mut threads) {
|
||||
handle
|
||||
for info in SignalsInfo::<WithOrigin>::new(TERM_SIGNALS)?.into_iter() {
|
||||
match info.signal {
|
||||
SIGQUIT => {
|
||||
info!("Got SIGQUIT. Terminate pageserver in immediate shutdown mode");
|
||||
exit(111);
|
||||
}
|
||||
SIGINT | SIGTERM => {
|
||||
info!("Got SIGINT/SIGTERM. Terminate gracefully in fast shutdown mode");
|
||||
// Terminate postgres backends
|
||||
postgres_backend::set_pgbackend_shutdown_requested();
|
||||
// Stop all tenants and flush their data
|
||||
tenant_mgr::shutdown_all_tenants()?;
|
||||
// Wait for pageservice thread to complete the job
|
||||
page_service_thread
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(0);
|
||||
// Shut down http router
|
||||
endpoint::shutdown();
|
||||
|
||||
// Wait for all threads
|
||||
for handle in join_handles.into_iter() {
|
||||
handle
|
||||
.join()
|
||||
.expect("thread panicked")
|
||||
.expect("thread exited with an error");
|
||||
}
|
||||
info!("Pageserver shut down successfully completed");
|
||||
exit(0);
|
||||
}
|
||||
unknown_signal => {
|
||||
debug!("Unknown signal {}", unknown_signal);
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -645,22 +602,16 @@ mod tests {
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
page_cache_size: Some("page_cache_size_VALUE".to_string()),
|
||||
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
remote_storage: Some(RemoteStorage::Local {
|
||||
local_path: "remote_storage_local_VALUE".to_string(),
|
||||
relish_storage: Some(RelishStorage::Local {
|
||||
local_path: "relish_storage_local_VALUE".to_string(),
|
||||
}),
|
||||
remote_storage_max_concurrent_sync: Some(
|
||||
"remote_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
remote_storage_max_sync_errors: Some(
|
||||
"remote_storage_max_sync_errors_VALUE".to_string(),
|
||||
relish_storage_max_concurrent_sync: Some(
|
||||
"relish_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
@@ -674,17 +625,13 @@ checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
page_cache_size = 'page_cache_size_VALUE'
|
||||
max_file_descriptors = 'max_file_descriptors_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
|
||||
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
|
||||
|
||||
[remote_storage]
|
||||
local_path = 'remote_storage_local_VALUE'
|
||||
[relish_storage]
|
||||
local_path = 'relish_storage_local_VALUE'
|
||||
"#,
|
||||
toml_pretty_string
|
||||
);
|
||||
@@ -712,25 +659,19 @@ local_path = 'remote_storage_local_VALUE'
|
||||
checkpoint_period: Some("checkpoint_period_VALUE".to_string()),
|
||||
gc_horizon: Some("gc_horizon_VALUE".to_string()),
|
||||
gc_period: Some("gc_period_VALUE".to_string()),
|
||||
open_mem_limit: Some("open_mem_limit_VALUE".to_string()),
|
||||
page_cache_size: Some("page_cache_size_VALUE".to_string()),
|
||||
max_file_descriptors: Some("max_file_descriptors_VALUE".to_string()),
|
||||
pg_distrib_dir: Some("pg_distrib_dir_VALUE".to_string()),
|
||||
auth_validation_public_key_path: Some(
|
||||
"auth_validation_public_key_path_VALUE".to_string(),
|
||||
),
|
||||
auth_type: Some("auth_type_VALUE".to_string()),
|
||||
remote_storage: Some(RemoteStorage::AwsS3 {
|
||||
relish_storage: Some(RelishStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: Some("access_key_id_VALUE".to_string()),
|
||||
secret_access_key: Some("secret_access_key_VALUE".to_string()),
|
||||
}),
|
||||
remote_storage_max_concurrent_sync: Some(
|
||||
"remote_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
remote_storage_max_sync_errors: Some(
|
||||
"remote_storage_max_sync_errors_VALUE".to_string(),
|
||||
relish_storage_max_concurrent_sync: Some(
|
||||
"relish_storage_max_concurrent_sync_VALUE".to_string(),
|
||||
),
|
||||
};
|
||||
|
||||
@@ -744,16 +685,12 @@ checkpoint_distance = 'checkpoint_distance_VALUE'
|
||||
checkpoint_period = 'checkpoint_period_VALUE'
|
||||
gc_horizon = 'gc_horizon_VALUE'
|
||||
gc_period = 'gc_period_VALUE'
|
||||
open_mem_limit = 'open_mem_limit_VALUE'
|
||||
page_cache_size = 'page_cache_size_VALUE'
|
||||
max_file_descriptors = 'max_file_descriptors_VALUE'
|
||||
pg_distrib_dir = 'pg_distrib_dir_VALUE'
|
||||
auth_validation_public_key_path = 'auth_validation_public_key_path_VALUE'
|
||||
auth_type = 'auth_type_VALUE'
|
||||
remote_storage_max_concurrent_sync = 'remote_storage_max_concurrent_sync_VALUE'
|
||||
remote_storage_max_sync_errors = 'remote_storage_max_sync_errors_VALUE'
|
||||
relish_storage_max_concurrent_sync = 'relish_storage_max_concurrent_sync_VALUE'
|
||||
|
||||
[remote_storage]
|
||||
[relish_storage]
|
||||
bucket_name = 'bucket_name_VALUE'
|
||||
bucket_region = 'bucket_region_VALUE'
|
||||
"#,
|
||||
@@ -766,7 +703,7 @@ bucket_region = 'bucket_region_VALUE'
|
||||
.expect("Failed to deserialize the prettified serialization result of the config");
|
||||
|
||||
let mut expected_params = params;
|
||||
expected_params.remote_storage = Some(RemoteStorage::AwsS3 {
|
||||
expected_params.relish_storage = Some(RelishStorage::AwsS3 {
|
||||
bucket_name: "bucket_name_VALUE".to_string(),
|
||||
bucket_region: "bucket_region_VALUE".to_string(),
|
||||
access_key_id: None,
|
||||
|
||||
@@ -42,6 +42,8 @@ pub struct BranchInfo {
|
||||
impl BranchInfo {
|
||||
pub fn from_path<T: AsRef<Path>>(
|
||||
path: T,
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
repo: &Arc<dyn Repository>,
|
||||
include_non_incremental_logical_size: bool,
|
||||
) -> Result<Self> {
|
||||
@@ -56,14 +58,27 @@ impl BranchInfo {
|
||||
|
||||
let timeline = repo.get_timeline(timeline_id)?;
|
||||
|
||||
// we use ancestor lsn zero if we don't have an ancestor, so turn this into an option based on timeline id
|
||||
let (ancestor_id, ancestor_lsn) = match timeline.get_ancestor_timeline_id() {
|
||||
Some(ancestor_id) => (
|
||||
Some(ancestor_id.to_string()),
|
||||
Some(timeline.get_ancestor_lsn().to_string()),
|
||||
),
|
||||
None => (None, None),
|
||||
};
|
||||
let ancestor_path = conf.ancestor_path(&timeline_id, tenantid);
|
||||
let mut ancestor_id: Option<String> = None;
|
||||
let mut ancestor_lsn: Option<String> = None;
|
||||
|
||||
if ancestor_path.exists() {
|
||||
let ancestor = std::fs::read_to_string(ancestor_path)?;
|
||||
let mut strings = ancestor.split('@');
|
||||
|
||||
ancestor_id = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
ancestor_lsn = Some(
|
||||
strings
|
||||
.next()
|
||||
.with_context(|| "wrong branch ancestor point in time format")?
|
||||
.to_owned(),
|
||||
);
|
||||
}
|
||||
|
||||
// non incremental size calculation can be heavy, so let it be optional
|
||||
// needed for tests to check size calculation
|
||||
@@ -139,11 +154,7 @@ pub fn create_repo(
|
||||
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// create a new timeline directory
|
||||
let timeline_id = ZTimelineId::generate();
|
||||
let timelinedir = conf.timeline_path(&timeline_id, &tenantid);
|
||||
|
||||
crashsafe_dir::create_dir(&timelinedir)?;
|
||||
let tli = create_timeline(conf, None, &tenantid)?;
|
||||
|
||||
let repo = Arc::new(crate::layered_repository::LayeredRepository::new(
|
||||
conf,
|
||||
@@ -155,7 +166,7 @@ pub fn create_repo(
|
||||
// Load data into pageserver
|
||||
// TODO To implement zenith import we need to
|
||||
// move data loading out of create_repo()
|
||||
bootstrap_timeline(conf, tenantid, timeline_id, repo.as_ref())?;
|
||||
bootstrap_timeline(conf, tenantid, tli, repo.as_ref())?;
|
||||
|
||||
Ok(repo)
|
||||
}
|
||||
@@ -222,9 +233,7 @@ fn bootstrap_timeline(
|
||||
|
||||
// Import the contents of the data directory at the initial checkpoint
|
||||
// LSN, and any WAL after that.
|
||||
// Initdb lsn will be equal to last_record_lsn which will be set after import.
|
||||
// Because we know it upfront avoid having an option or dummy zero value by passing it to create_empty_timeline.
|
||||
let timeline = repo.create_empty_timeline(tli, lsn)?;
|
||||
let timeline = repo.create_empty_timeline(tli)?;
|
||||
restore_local_repo::import_timeline_from_postgres_datadir(
|
||||
&pgdata_path,
|
||||
timeline.writer().as_ref(),
|
||||
@@ -259,24 +268,13 @@ pub(crate) fn get_branches(
|
||||
// with timeline_id.
|
||||
let branches_dir = conf.branches_path(tenantid);
|
||||
|
||||
std::fs::read_dir(&branches_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Found no branches directory '{}' for tenant {}",
|
||||
branches_dir.display(),
|
||||
tenantid
|
||||
)
|
||||
})?
|
||||
std::fs::read_dir(&branches_dir)?
|
||||
.map(|dir_entry_res| {
|
||||
let dir_entry = dir_entry_res.with_context(|| {
|
||||
format!(
|
||||
"Failed to list branches directory '{}' content for tenant {}",
|
||||
branches_dir.display(),
|
||||
tenantid
|
||||
)
|
||||
})?;
|
||||
let dir_entry = dir_entry_res?;
|
||||
BranchInfo::from_path(
|
||||
dir_entry.path(),
|
||||
conf,
|
||||
tenantid,
|
||||
&repo,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
@@ -322,24 +320,24 @@ pub(crate) fn create_branch(
|
||||
);
|
||||
}
|
||||
|
||||
let new_timeline_id = ZTimelineId::generate();
|
||||
// create a new timeline directory for it
|
||||
let newtli = create_timeline(conf, Some(startpoint), tenantid)?;
|
||||
|
||||
// Forward entire timeline creation routine to repository
|
||||
// backend, so it can do all needed initialization
|
||||
repo.branch_timeline(startpoint.timelineid, new_timeline_id, startpoint.lsn)?;
|
||||
// Let the Repository backend do its initialization
|
||||
repo.branch_timeline(startpoint.timelineid, newtli, startpoint.lsn)?;
|
||||
|
||||
// Remember the human-readable branch name for the new timeline.
|
||||
// FIXME: there's a race condition, if you create a branch with the same
|
||||
// name concurrently.
|
||||
let data = new_timeline_id.to_string();
|
||||
let data = newtli.to_string();
|
||||
fs::write(conf.branch_path(branchname, tenantid), data)?;
|
||||
|
||||
Ok(BranchInfo {
|
||||
name: branchname.to_string(),
|
||||
timeline_id: new_timeline_id,
|
||||
timeline_id: newtli,
|
||||
latest_valid_lsn: startpoint.lsn,
|
||||
ancestor_id: Some(startpoint.timelineid.to_string()),
|
||||
ancestor_lsn: Some(startpoint.lsn.to_string()),
|
||||
ancestor_id: None,
|
||||
ancestor_lsn: None,
|
||||
current_logical_size: 0,
|
||||
current_logical_size_non_incremental: Some(0),
|
||||
})
|
||||
@@ -417,3 +415,24 @@ fn parse_point_in_time(
|
||||
|
||||
bail!("could not parse point-in-time {}", s);
|
||||
}
|
||||
|
||||
fn create_timeline(
|
||||
conf: &PageServerConf,
|
||||
ancestor: Option<PointInTime>,
|
||||
tenantid: &ZTenantId,
|
||||
) -> Result<ZTimelineId> {
|
||||
// Create initial timeline
|
||||
|
||||
let timelineid = ZTimelineId::generate();
|
||||
|
||||
let timelinedir = conf.timeline_path(&timelineid, tenantid);
|
||||
|
||||
fs::create_dir(&timelinedir)?;
|
||||
|
||||
if let Some(ancestor) = ancestor {
|
||||
let data = format!("{}@{}", ancestor.timelineid, ancestor.lsn);
|
||||
fs::write(timelinedir.join("ancestor"), data)?;
|
||||
}
|
||||
|
||||
Ok(timelineid)
|
||||
}
|
||||
|
||||
@@ -132,7 +132,13 @@ async fn branch_detail_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("branch_detail", tenant = %tenantid, branch=%branch_name).entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
BranchInfo::from_path(path, &repo, include_non_incremental_logical_size)
|
||||
BranchInfo::from_path(
|
||||
path,
|
||||
conf,
|
||||
&tenantid,
|
||||
&repo,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
@@ -16,25 +16,24 @@ use bookfile::Book;
|
||||
use bytes::Bytes;
|
||||
use lazy_static::lazy_static;
|
||||
use postgres_ffi::pg_constants::BLCKSZ;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
|
||||
use std::cmp;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::ops::{Bound::Included, Deref};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{self, AtomicUsize};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use self::metadata::{metadata_path, TimelineMetadata};
|
||||
use crate::page_cache;
|
||||
use crate::relish::*;
|
||||
use crate::remote_storage::schedule_timeline_checkpoint_upload;
|
||||
use crate::relish_storage::schedule_timeline_upload;
|
||||
use crate::repository::{GcResult, Repository, Timeline, TimelineWriter, WALRecord};
|
||||
use crate::tenant_mgr;
|
||||
use crate::walreceiver;
|
||||
@@ -48,35 +47,30 @@ use zenith_metrics::{
|
||||
register_histogram, register_int_gauge_vec, Histogram, IntGauge, IntGaugeVec,
|
||||
};
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::crashsafe_dir;
|
||||
use zenith_utils::lsn::{AtomicLsn, Lsn, RecordLsn};
|
||||
use zenith_utils::seqwait::SeqWait;
|
||||
|
||||
mod blob;
|
||||
mod delta_layer;
|
||||
mod ephemeral_file;
|
||||
mod filename;
|
||||
mod global_layer_map;
|
||||
mod image_layer;
|
||||
mod inmemory_layer;
|
||||
mod interval_tree;
|
||||
mod layer_map;
|
||||
pub mod metadata;
|
||||
mod page_versions;
|
||||
mod storage_layer;
|
||||
|
||||
use delta_layer::DeltaLayer;
|
||||
use image_layer::ImageLayer;
|
||||
|
||||
use global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
use inmemory_layer::InMemoryLayer;
|
||||
use layer_map::LayerMap;
|
||||
use storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, SegmentTag, RELISH_SEG_SIZE,
|
||||
};
|
||||
|
||||
pub use crate::layered_repository::ephemeral_file::writeback as writeback_ephemeral_file;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
@@ -117,6 +111,8 @@ lazy_static! {
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
/// Parts of the `.zenith/tenants/<tenantid>/timelines/<timelineid>` directory prefix.
|
||||
pub const TENANTS_SEGMENT_NAME: &str = "tenants";
|
||||
pub const TIMELINES_SEGMENT_NAME: &str = "timelines";
|
||||
@@ -143,17 +139,18 @@ impl Repository for LayeredRepository {
|
||||
Ok(self.get_timeline_locked(timelineid, &mut timelines)?)
|
||||
}
|
||||
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>> {
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Create the timeline directory, and write initial metadata to file.
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenantid))?;
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), Lsn(0), initdb_lsn);
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
prev_record_lsn: None,
|
||||
ancestor_timeline: None,
|
||||
ancestor_lsn: Lsn(0),
|
||||
};
|
||||
Self::save_metadata(self.conf, timelineid, self.tenantid, &metadata, true)?;
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
@@ -175,15 +172,7 @@ impl Repository for LayeredRepository {
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
|
||||
// We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
|
||||
// about timelines, so otherwise a race condition is possible, where we create new timeline and GC
|
||||
// concurrently removes data that is needed by the new timeline.
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let src_timeline = self.get_timeline_locked(src, &mut timelines)?;
|
||||
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn)
|
||||
.context("invalid branch start lsn")?;
|
||||
let src_timeline = self.get_timeline(src)?;
|
||||
|
||||
let RecordLsn {
|
||||
last: src_last,
|
||||
@@ -197,22 +186,15 @@ impl Repository for LayeredRepository {
|
||||
None
|
||||
};
|
||||
|
||||
// create a new timeline directory
|
||||
let timelinedir = self.conf.timeline_path(&dst, &self.tenantid);
|
||||
|
||||
crashsafe_dir::create_dir(&timelinedir)?;
|
||||
|
||||
// Create the metadata file, noting the ancestor of the new timeline.
|
||||
// There is initially no data in it, but all the read-calls know to look
|
||||
// into the ancestor.
|
||||
let metadata = TimelineMetadata::new(
|
||||
start_lsn,
|
||||
dst_prev,
|
||||
Some(src),
|
||||
start_lsn,
|
||||
src_timeline.latest_gc_cutoff_lsn.load(),
|
||||
src_timeline.initdb_lsn,
|
||||
);
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: start_lsn,
|
||||
prev_record_lsn: dst_prev,
|
||||
ancestor_timeline: Some(src),
|
||||
ancestor_lsn: start_lsn,
|
||||
};
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&dst, &self.tenantid))?;
|
||||
Self::save_metadata(self.conf, dst, self.tenantid, &metadata, true)?;
|
||||
|
||||
@@ -238,22 +220,16 @@ impl Repository for LayeredRepository {
|
||||
}
|
||||
|
||||
fn checkpoint_iteration(&self, cconf: CheckpointConfig) -> Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
// checkpoints. We don't want to block everything else while the
|
||||
// checkpoint runs.
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_checkpoint: Vec<(ZTimelineId, Arc<LayeredTimeline>)> = timelines
|
||||
.iter()
|
||||
.map(|(timelineid, timeline)| (*timelineid, timeline.clone()))
|
||||
.collect();
|
||||
drop(timelines);
|
||||
{
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
|
||||
for (timelineid, timeline) in timelines_to_checkpoint.iter() {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid).entered();
|
||||
for (timelineid, timeline) in timelines.iter() {
|
||||
let _entered =
|
||||
info_span!("checkpoint", timeline = %timelineid, tenant = %self.tenantid)
|
||||
.entered();
|
||||
|
||||
timeline.checkpoint(cconf)?;
|
||||
timeline.checkpoint(cconf)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -265,37 +241,16 @@ impl Repository for LayeredRepository {
|
||||
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
for (timelineid, timeline) in timelines.iter() {
|
||||
shutdown_timeline(*timelineid, timeline.as_ref())?;
|
||||
walreceiver::stop_wal_receiver(*timelineid);
|
||||
// Wait for syncing data to disk
|
||||
trace!("repo shutdown. checkpoint timeline {}", timelineid);
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
//TODO Wait for walredo process to shutdown too
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unload_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let removed_timeline = match timelines.remove(&timeline_id) {
|
||||
Some(timeline) => timeline,
|
||||
None => {
|
||||
warn!("Timeline {} not found, nothing to remove", timeline_id);
|
||||
return Ok(());
|
||||
}
|
||||
};
|
||||
drop(timelines);
|
||||
shutdown_timeline(timeline_id, removed_timeline.as_ref())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn shutdown_timeline(
|
||||
timelineid: ZTimelineId,
|
||||
timeline: &LayeredTimeline,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
walreceiver::stop_wal_receiver(timelineid);
|
||||
trace!("repo shutdown. checkpoint timeline {}", timelineid);
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
//TODO Wait for walredo process to shutdown too
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Private functions
|
||||
@@ -310,16 +265,15 @@ impl LayeredRepository {
|
||||
match timelines.get(&timelineid) {
|
||||
Some(timeline) => Ok(timeline.clone()),
|
||||
None => {
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)
|
||||
.context("failed to load metadata")?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn();
|
||||
let metadata = Self::load_metadata(self.conf, timelineid, self.tenantid)?;
|
||||
let disk_consistent_lsn = metadata.disk_consistent_lsn;
|
||||
|
||||
// Recurse to look up the ancestor timeline.
|
||||
//
|
||||
// TODO: If you have a very deep timeline history, this could become
|
||||
// expensive. Perhaps delay this until we need to look up a page in
|
||||
// ancestor.
|
||||
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline() {
|
||||
let ancestor = if let Some(ancestor_timelineid) = metadata.ancestor_timeline {
|
||||
Some(self.get_timeline_locked(ancestor_timelineid, timelines)?)
|
||||
} else {
|
||||
None
|
||||
@@ -341,16 +295,9 @@ impl LayeredRepository {
|
||||
)?;
|
||||
|
||||
// List the layers on disk, and load them into the layer map
|
||||
let loaded_layers = timeline
|
||||
.load_layer_map(disk_consistent_lsn)
|
||||
.context("failed to load layermap")?;
|
||||
let loaded_layers = timeline.load_layer_map(disk_consistent_lsn)?;
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_checkpoint_upload(
|
||||
self.tenantid,
|
||||
timelineid,
|
||||
loaded_layers,
|
||||
metadata,
|
||||
);
|
||||
schedule_timeline_upload(self.tenantid, timelineid, loaded_layers, metadata);
|
||||
}
|
||||
|
||||
// needs to be after load_layer_map
|
||||
@@ -420,7 +367,6 @@ impl LayeredRepository {
|
||||
tenantid: ZTenantId,
|
||||
) -> Result<TimelineMetadata> {
|
||||
let path = metadata_path(conf, timelineid, tenantid);
|
||||
info!("loading metadata from {}", path.display());
|
||||
let metadata_bytes = std::fs::read(&path)?;
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
}
|
||||
@@ -547,6 +493,66 @@ impl LayeredRepository {
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
pub prev_record_lsn: Option<Lsn>,
|
||||
|
||||
pub ancestor_timeline: Option<ZTimelineId>,
|
||||
pub ancestor_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::des_prefix(data)?;
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let mut metadata_bytes = TimelineMetadata::ser(self)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LayeredTimeline {
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
@@ -615,17 +621,6 @@ pub struct LayeredTimeline {
|
||||
/// Must always be acquired before the layer map/individual layer lock
|
||||
/// to avoid deadlock.
|
||||
write_lock: Mutex<()>,
|
||||
|
||||
// Needed to ensure that we can't create a branch at a point that was already garbage collected
|
||||
latest_gc_cutoff_lsn: AtomicLsn,
|
||||
|
||||
// It may change across major versions so for simplicity
|
||||
// keep it after running initdb for a timeline.
|
||||
// It is needed in checks when we want to error on some operations
|
||||
// when they are requested for pre-initdb lsn.
|
||||
// It can be unified with latest_gc_cutoff_lsn under some "first_valid_lsn",
|
||||
// though lets keep them both for better error visibility.
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Public interface functions
|
||||
@@ -634,10 +629,6 @@ impl Timeline for LayeredTimeline {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline.as_ref().map(|x| x.timelineid)
|
||||
}
|
||||
|
||||
/// Wait until WAL has been received up to the given LSN.
|
||||
fn wait_lsn(&self, lsn: Lsn) -> Result<()> {
|
||||
// This should never be called from the WAL receiver thread, because that could lead
|
||||
@@ -669,13 +660,6 @@ impl Timeline for LayeredTimeline {
|
||||
);
|
||||
}
|
||||
debug_assert!(lsn <= self.get_last_record_lsn());
|
||||
let latest_gc_cutoff_lsn = self.latest_gc_cutoff_lsn.load();
|
||||
// error instead of assert to simplify testing
|
||||
ensure!(
|
||||
lsn >= latest_gc_cutoff_lsn,
|
||||
"tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
|
||||
lsn, latest_gc_cutoff_lsn
|
||||
);
|
||||
|
||||
let seg = SegmentTag::from_blknum(rel, blknum);
|
||||
|
||||
@@ -683,16 +667,7 @@ impl Timeline for LayeredTimeline {
|
||||
RECONSTRUCT_TIME
|
||||
.observe_closure_duration(|| self.materialize_page(seg, blknum, lsn, &*layer))
|
||||
} else {
|
||||
// FIXME: This can happen if PostgreSQL extends a relation but never writes
|
||||
// the page. See https://github.com/zenithdb/zenith/issues/841
|
||||
//
|
||||
// Would be nice to detect that situation better.
|
||||
if seg.segno > 0 && self.get_rel_exists(rel, lsn)? {
|
||||
warn!("Page {} blk {} at {} not found", seg.rel, blknum, lsn);
|
||||
return Ok(ZERO_PAGE.clone());
|
||||
}
|
||||
|
||||
bail!("segment {} not found at {}", rel, lsn);
|
||||
bail!("relish {} not found at {}", rel, lsn);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -835,28 +810,6 @@ impl Timeline for LayeredTimeline {
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Validate lsn against initdb_lsn and latest_gc_cutoff_lsn.
|
||||
///
|
||||
fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()> {
|
||||
let initdb_lsn = self.initdb_lsn;
|
||||
ensure!(
|
||||
lsn >= initdb_lsn,
|
||||
"LSN {} is earlier than initdb lsn {}",
|
||||
lsn,
|
||||
initdb_lsn,
|
||||
);
|
||||
|
||||
let latest_gc_cutoff_lsn = self.latest_gc_cutoff_lsn.load();
|
||||
ensure!(
|
||||
lsn >= latest_gc_cutoff_lsn,
|
||||
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
|
||||
lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_last_record_lsn(&self) -> Lsn {
|
||||
self.last_record_lsn.load().last
|
||||
}
|
||||
@@ -878,7 +831,7 @@ impl Timeline for LayeredTimeline {
|
||||
}
|
||||
|
||||
fn get_current_logical_size(&self) -> usize {
|
||||
self.current_logical_size.load(atomic::Ordering::Acquire) as usize
|
||||
self.current_logical_size.load(Ordering::Acquire) as usize
|
||||
}
|
||||
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize> {
|
||||
@@ -918,10 +871,6 @@ impl Timeline for LayeredTimeline {
|
||||
_write_guard: self.write_lock.lock().unwrap(),
|
||||
})
|
||||
}
|
||||
|
||||
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl LayeredTimeline {
|
||||
@@ -952,21 +901,18 @@ impl LayeredTimeline {
|
||||
|
||||
// initialize in-memory 'last_record_lsn' from 'disk_consistent_lsn'.
|
||||
last_record_lsn: SeqWait::new(RecordLsn {
|
||||
last: metadata.disk_consistent_lsn(),
|
||||
prev: metadata.prev_record_lsn().unwrap_or(Lsn(0)),
|
||||
last: metadata.disk_consistent_lsn,
|
||||
prev: metadata.prev_record_lsn.unwrap_or(Lsn(0)),
|
||||
}),
|
||||
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn().0),
|
||||
disk_consistent_lsn: AtomicLsn::new(metadata.disk_consistent_lsn.0),
|
||||
|
||||
ancestor_timeline: ancestor,
|
||||
ancestor_lsn: metadata.ancestor_lsn(),
|
||||
ancestor_lsn: metadata.ancestor_lsn,
|
||||
current_logical_size: AtomicUsize::new(current_logical_size),
|
||||
current_logical_size_gauge,
|
||||
upload_relishes,
|
||||
|
||||
write_lock: Mutex::new(()),
|
||||
|
||||
latest_gc_cutoff_lsn: AtomicLsn::from(metadata.latest_gc_cutoff_lsn()),
|
||||
initdb_lsn: metadata.initdb_lsn(),
|
||||
};
|
||||
Ok(timeline)
|
||||
}
|
||||
@@ -1005,12 +951,7 @@ impl LayeredTimeline {
|
||||
|
||||
for filename in &deltafilenames {
|
||||
ensure!(filename.start_lsn < filename.end_lsn);
|
||||
// The end-LSN is exclusive, while disk_consistent_lsn is
|
||||
// inclusive. For example, if disk_consistent_lsn is 100, it is
|
||||
// OK for a delta layer to have end LSN 101, but if the end LSN
|
||||
// is 102, then it might not have been fully flushed to disk
|
||||
// before crash.
|
||||
if filename.end_lsn > disk_consistent_lsn + 1 {
|
||||
if filename.end_lsn > disk_consistent_lsn {
|
||||
warn!(
|
||||
"found future delta layer {} on timeline {}",
|
||||
filename, self.timelineid
|
||||
@@ -1036,7 +977,7 @@ impl LayeredTimeline {
|
||||
/// Used to init current logical size on startup
|
||||
///
|
||||
fn init_current_logical_size(&mut self) -> Result<()> {
|
||||
if self.current_logical_size.load(atomic::Ordering::Relaxed) != 0 {
|
||||
if self.current_logical_size.load(Ordering::Relaxed) != 0 {
|
||||
bail!("cannot init already initialized current logical size")
|
||||
};
|
||||
let lsn = self.get_last_record_lsn();
|
||||
@@ -1044,7 +985,7 @@ impl LayeredTimeline {
|
||||
AtomicUsize::new(self.get_current_logical_size_non_incremental(lsn)?);
|
||||
trace!(
|
||||
"current_logical_size initialized to {}",
|
||||
self.current_logical_size.load(atomic::Ordering::Relaxed)
|
||||
self.current_logical_size.load(Ordering::Relaxed)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -1271,10 +1212,8 @@ impl LayeredTimeline {
|
||||
// a lot of memory and/or aren't receiving much updates anymore.
|
||||
let mut disk_consistent_lsn = last_record_lsn;
|
||||
|
||||
let mut layer_uploads = Vec::new();
|
||||
while let Some((oldest_layer_id, oldest_layer, oldest_generation)) =
|
||||
layers.peek_oldest_open()
|
||||
{
|
||||
let mut layer_paths = Vec::new();
|
||||
while let Some((oldest_layer, oldest_generation)) = layers.peek_oldest_open() {
|
||||
let oldest_pending_lsn = oldest_layer.get_oldest_pending_lsn();
|
||||
|
||||
// Does this layer need freezing?
|
||||
@@ -1299,14 +1238,38 @@ impl LayeredTimeline {
|
||||
break;
|
||||
}
|
||||
|
||||
// Mark the layer as no longer accepting writes and record the end_lsn.
|
||||
// This happens in-place, no new layers are created now.
|
||||
// We call `get_last_record_lsn` again, which may be different from the
|
||||
// original load, as we may have released the write lock since then.
|
||||
oldest_layer.freeze(self.get_last_record_lsn());
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
layers.pop_oldest_open();
|
||||
layers.insert_historic(oldest_layer.clone());
|
||||
|
||||
// Write the now-frozen layer to disk. That could take a while, so release the lock while do it
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
let mut this_layer_uploads = self.evict_layer(oldest_layer_id)?;
|
||||
layer_uploads.append(&mut this_layer_uploads);
|
||||
let new_historics = oldest_layer.write_to_disk(self)?;
|
||||
|
||||
write_guard = self.write_lock.lock().unwrap();
|
||||
layers = self.layers.lock().unwrap();
|
||||
|
||||
// Finally, replace the frozen in-memory layer with the new on-disk layers
|
||||
layers.remove_historic(oldest_layer);
|
||||
|
||||
// Add the historics to the LayerMap
|
||||
for delta_layer in new_historics.delta_layers {
|
||||
layer_paths.push(delta_layer.path());
|
||||
layers.insert_historic(Arc::new(delta_layer));
|
||||
}
|
||||
for image_layer in new_historics.image_layers {
|
||||
layer_paths.push(image_layer.path());
|
||||
layers.insert_historic(Arc::new(image_layer));
|
||||
}
|
||||
}
|
||||
|
||||
// Call unload() on all frozen layers, to release memory.
|
||||
@@ -1319,7 +1282,12 @@ impl LayeredTimeline {
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
if !layer_uploads.is_empty() {
|
||||
if !layer_paths.is_empty() {
|
||||
for layer_path in &layer_paths {
|
||||
let file = File::open(layer_path)?;
|
||||
file.sync_all()?;
|
||||
}
|
||||
|
||||
// We must fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
let timeline_dir =
|
||||
@@ -1347,14 +1315,12 @@ impl LayeredTimeline {
|
||||
|
||||
let ancestor_timelineid = self.ancestor_timeline.as_ref().map(|x| x.timelineid);
|
||||
|
||||
let metadata = TimelineMetadata::new(
|
||||
let metadata = TimelineMetadata {
|
||||
disk_consistent_lsn,
|
||||
ondisk_prev_record_lsn,
|
||||
ancestor_timelineid,
|
||||
self.ancestor_lsn,
|
||||
self.latest_gc_cutoff_lsn.load(),
|
||||
self.initdb_lsn,
|
||||
);
|
||||
prev_record_lsn: ondisk_prev_record_lsn,
|
||||
ancestor_timeline: ancestor_timelineid,
|
||||
ancestor_lsn: self.ancestor_lsn,
|
||||
};
|
||||
|
||||
LayeredRepository::save_metadata(
|
||||
self.conf,
|
||||
@@ -1364,12 +1330,7 @@ impl LayeredTimeline {
|
||||
false,
|
||||
)?;
|
||||
if self.upload_relishes {
|
||||
schedule_timeline_checkpoint_upload(
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
layer_uploads,
|
||||
metadata,
|
||||
);
|
||||
schedule_timeline_upload(self.tenantid, self.timelineid, layer_paths, metadata);
|
||||
}
|
||||
|
||||
// Also update the in-memory copy
|
||||
@@ -1379,55 +1340,6 @@ impl LayeredTimeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn evict_layer(&self, layer_id: LayerId) -> Result<Vec<PathBuf>> {
|
||||
// Mark the layer as no longer accepting writes and record the end_lsn.
|
||||
// This happens in-place, no new layers are created now.
|
||||
// We call `get_last_record_lsn` again, which may be different from the
|
||||
// original load, as we may have released the write lock since then.
|
||||
|
||||
let mut write_guard = self.write_lock.lock().unwrap();
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
|
||||
let mut layer_uploads = Vec::new();
|
||||
|
||||
let global_layer_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
if let Some(oldest_layer) = global_layer_map.get(&layer_id) {
|
||||
drop(global_layer_map);
|
||||
oldest_layer.freeze(self.get_last_record_lsn());
|
||||
|
||||
// The layer is no longer open, update the layer map to reflect this.
|
||||
// We will replace it with on-disk historics below.
|
||||
layers.remove_open(layer_id);
|
||||
layers.insert_historic(oldest_layer.clone());
|
||||
|
||||
// Write the now-frozen layer to disk. That could take a while, so release the lock while do it
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
let new_historics = oldest_layer.write_to_disk(self)?;
|
||||
|
||||
write_guard = self.write_lock.lock().unwrap();
|
||||
layers = self.layers.lock().unwrap();
|
||||
|
||||
// Finally, replace the frozen in-memory layer with the new on-disk layers
|
||||
layers.remove_historic(oldest_layer);
|
||||
|
||||
// Add the historics to the LayerMap
|
||||
for delta_layer in new_historics.delta_layers {
|
||||
layer_uploads.push(delta_layer.path());
|
||||
layers.insert_historic(Arc::new(delta_layer));
|
||||
}
|
||||
for image_layer in new_historics.image_layers {
|
||||
layer_uploads.push(image_layer.path());
|
||||
layers.insert_historic(Arc::new(image_layer));
|
||||
}
|
||||
}
|
||||
drop(layers);
|
||||
drop(write_guard);
|
||||
|
||||
Ok(layer_uploads)
|
||||
}
|
||||
|
||||
///
|
||||
/// Garbage collect layer files on a timeline that are no longer needed.
|
||||
///
|
||||
@@ -1456,10 +1368,6 @@ impl LayeredTimeline {
|
||||
|
||||
let _enter = info_span!("garbage collection", timeline = %self.timelineid, tenant = %self.tenantid, cutoff = %cutoff).entered();
|
||||
|
||||
// We need to ensure that no one branches at a point before latest_gc_cutoff_lsn.
|
||||
// See branch_timeline() for details.
|
||||
self.latest_gc_cutoff_lsn.store(cutoff);
|
||||
|
||||
info!("GC starting");
|
||||
|
||||
debug!("retain_lsns: {:?}", retain_lsns);
|
||||
@@ -1476,16 +1384,6 @@ impl LayeredTimeline {
|
||||
//
|
||||
let mut layers = self.layers.lock().unwrap();
|
||||
'outer: for l in layers.iter_historic_layers() {
|
||||
// This layer is in the process of being flushed to disk.
|
||||
// It will be swapped out of the layer map, replaced with
|
||||
// on-disk layers containing the same data.
|
||||
// We can't GC it, as it's not on disk. We can't remove it
|
||||
// from the layer map yet, as it would make its data
|
||||
// inaccessible.
|
||||
if l.is_in_memory() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let seg = l.get_seg_tag();
|
||||
|
||||
if seg.rel.is_relation() {
|
||||
@@ -1512,21 +1410,15 @@ impl LayeredTimeline {
|
||||
}
|
||||
|
||||
// 2. Is it needed by a child branch?
|
||||
// NOTE With that wee would keep data that
|
||||
// might be referenced by child branches forever.
|
||||
// We can track this in child timeline GC and delete parent layers when
|
||||
// they are no longer needed. This might be complicated with long inheritance chains.
|
||||
for retain_lsn in &retain_lsns {
|
||||
// start_lsn is inclusive
|
||||
if &l.get_start_lsn() <= retain_lsn {
|
||||
// start_lsn is inclusive and end_lsn is exclusive
|
||||
if l.get_start_lsn() <= *retain_lsn && *retain_lsn < l.get_end_lsn() {
|
||||
info!(
|
||||
"keeping {} {}-{} because it's still might be referenced by child branch forked at {} is_dropped: {} is_incremental: {}",
|
||||
"keeping {} {}-{} because it's needed by branch point {}",
|
||||
seg,
|
||||
l.get_start_lsn(),
|
||||
l.get_end_lsn(),
|
||||
retain_lsn,
|
||||
l.is_dropped(),
|
||||
l.is_incremental(),
|
||||
*retain_lsn
|
||||
);
|
||||
if seg.rel.is_relation() {
|
||||
result.ondisk_relfiles_needed_by_branches += 1;
|
||||
@@ -1627,12 +1519,11 @@ impl LayeredTimeline {
|
||||
|
||||
// We didn't find any reason to keep this file, so remove it.
|
||||
info!(
|
||||
"garbage collecting {} {}-{} is_dropped: {} is_incremental: {}",
|
||||
"garbage collecting {} {}-{} {}",
|
||||
l.get_seg_tag(),
|
||||
l.get_start_lsn(),
|
||||
l.get_end_lsn(),
|
||||
l.is_dropped(),
|
||||
l.is_incremental(),
|
||||
l.is_dropped()
|
||||
);
|
||||
layers_to_remove.push(Arc::clone(&l));
|
||||
}
|
||||
@@ -1659,23 +1550,6 @@ impl LayeredTimeline {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn lookup_cached_page(&self, seg: &SegmentTag, blknum: u32, lsn: Lsn) -> Option<(Lsn, Bytes)> {
|
||||
let cache = page_cache::get();
|
||||
if let RelishTag::Relation(rel_tag) = &seg.rel {
|
||||
let (lsn, read_guard) = cache.lookup_materialized_page(
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
*rel_tag,
|
||||
blknum,
|
||||
lsn,
|
||||
)?;
|
||||
let img = Bytes::from(read_guard.to_vec());
|
||||
Some((lsn, img))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Reconstruct a page version from given Layer
|
||||
///
|
||||
@@ -1686,22 +1560,6 @@ impl LayeredTimeline {
|
||||
lsn: Lsn,
|
||||
layer: &dyn Layer,
|
||||
) -> Result<Bytes> {
|
||||
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
|
||||
// The cached image can be returned directly if there is no WAL between the cached image
|
||||
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
|
||||
// for redo.
|
||||
let (cached_lsn_opt, cached_page_opt) = match self.lookup_cached_page(&seg, blknum, lsn) {
|
||||
Some((cached_lsn, cached_img)) => {
|
||||
match cached_lsn.cmp(&lsn) {
|
||||
cmp::Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
|
||||
cmp::Ordering::Equal => return Ok(cached_img), // exact LSN match, return the image
|
||||
cmp::Ordering::Greater => panic!(), // the returned lsn should never be after the requested lsn
|
||||
}
|
||||
(Some(cached_lsn), Some((cached_lsn, cached_img)))
|
||||
}
|
||||
None => (None, None),
|
||||
};
|
||||
|
||||
let mut data = PageReconstructData {
|
||||
records: Vec::new(),
|
||||
page_img: None,
|
||||
@@ -1716,32 +1574,11 @@ impl LayeredTimeline {
|
||||
let mut layer_ref = layer;
|
||||
let mut curr_lsn = lsn;
|
||||
loop {
|
||||
let result = layer_ref
|
||||
.get_page_reconstruct_data(blknum, curr_lsn, cached_lsn_opt, &mut data)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get reconstruct data {} {:?} {} {} {:?}",
|
||||
layer_ref.get_seg_tag(),
|
||||
layer_ref.filename(),
|
||||
blknum,
|
||||
curr_lsn,
|
||||
cached_lsn_opt,
|
||||
)
|
||||
})?;
|
||||
match result {
|
||||
match layer_ref.get_page_reconstruct_data(blknum, curr_lsn, &mut data)? {
|
||||
PageReconstructResult::Complete => break,
|
||||
PageReconstructResult::Continue(cont_lsn) => {
|
||||
// Fetch base image / more WAL from the returned predecessor layer
|
||||
if let Some((cont_layer, cont_lsn)) = self.get_layer_for_read(seg, cont_lsn)? {
|
||||
if cont_lsn == curr_lsn {
|
||||
// We landed on the same layer again. Shouldn't happen, but if it does,
|
||||
// don't get stuck in an infinite loop.
|
||||
bail!(
|
||||
"could not find predecessor layer of segment {} at {}",
|
||||
seg.rel,
|
||||
cont_lsn
|
||||
);
|
||||
}
|
||||
layer_arc = cont_layer;
|
||||
layer_ref = &*layer_arc;
|
||||
curr_lsn = cont_lsn;
|
||||
@@ -1772,16 +1609,6 @@ impl LayeredTimeline {
|
||||
lsn,
|
||||
);
|
||||
}
|
||||
PageReconstructResult::Cached => {
|
||||
let (cached_lsn, cached_img) = cached_page_opt.unwrap();
|
||||
assert!(data.page_img.is_none());
|
||||
if let Some((first_rec_lsn, first_rec)) = data.records.first() {
|
||||
assert!(&cached_lsn < first_rec_lsn);
|
||||
assert!(!first_rec.will_init);
|
||||
}
|
||||
data.page_img = Some(cached_img);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1837,9 +1664,6 @@ impl LayeredTimeline {
|
||||
} else {
|
||||
trace!("found {} WAL records that will init the page for blk {} in {} at {}, performing WAL redo", data.records.len(), blknum, rel, request_lsn);
|
||||
}
|
||||
|
||||
let last_rec_lsn = data.records.last().unwrap().0;
|
||||
|
||||
let img = self.walredo_mgr.request_redo(
|
||||
rel,
|
||||
blknum,
|
||||
@@ -1848,18 +1672,6 @@ impl LayeredTimeline {
|
||||
data.records,
|
||||
)?;
|
||||
|
||||
if let RelishTag::Relation(rel_tag) = &rel {
|
||||
let cache = page_cache::get();
|
||||
cache.memorize_materialized_page(
|
||||
self.tenantid,
|
||||
self.timelineid,
|
||||
*rel_tag,
|
||||
blknum,
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
@@ -1871,7 +1683,7 @@ impl LayeredTimeline {
|
||||
fn increase_current_logical_size(&self, diff: u32) {
|
||||
let val = self
|
||||
.current_logical_size
|
||||
.fetch_add(diff as usize, atomic::Ordering::SeqCst);
|
||||
.fetch_add(diff as usize, Ordering::SeqCst);
|
||||
trace!(
|
||||
"increase_current_logical_size: {} + {} = {}",
|
||||
val,
|
||||
@@ -1888,7 +1700,7 @@ impl LayeredTimeline {
|
||||
fn decrease_current_logical_size(&self, diff: u32) {
|
||||
let val = self
|
||||
.current_logical_size
|
||||
.fetch_sub(diff as usize, atomic::Ordering::SeqCst);
|
||||
.fetch_sub(diff as usize, Ordering::SeqCst);
|
||||
trace!(
|
||||
"decrease_current_logical_size: {} - {} = {}",
|
||||
val,
|
||||
@@ -1926,7 +1738,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
|
||||
let seg = SegmentTag::from_blknum(rel, blknum);
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
let delta_size = layer.put_wal_record(lsn, blknum, rec)?;
|
||||
let delta_size = layer.put_wal_record(lsn, blknum, rec);
|
||||
self.tl
|
||||
.increase_current_logical_size(delta_size * BLCKSZ as u32);
|
||||
Ok(())
|
||||
@@ -1945,7 +1757,7 @@ impl<'a> TimelineWriter for LayeredTimelineWriter<'a> {
|
||||
let seg = SegmentTag::from_blknum(rel, blknum);
|
||||
|
||||
let layer = self.tl.get_layer_for_write(seg, lsn)?;
|
||||
let delta_size = layer.put_page_image(blknum, lsn, img)?;
|
||||
let delta_size = layer.put_page_image(blknum, lsn, img);
|
||||
|
||||
self.tl
|
||||
.increase_current_logical_size(delta_size * BLCKSZ as u32);
|
||||
@@ -2076,6 +1888,15 @@ pub fn dump_layerfile_from_path(path: &Path) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Add a suffix to a layer file's name: .{num}.old
|
||||
/// Uses the first available num (starts at 0)
|
||||
fn rename_to_backup(path: PathBuf) -> anyhow::Result<()> {
|
||||
|
||||
@@ -1,56 +1,12 @@
|
||||
# Overview
|
||||
|
||||
The on-disk format is based on immutable files. The page server receives a
|
||||
stream of incoming WAL, parses the WAL records to determine which pages they
|
||||
apply to, and accumulates the incoming changes in memory. Every now and then,
|
||||
the accumulated changes are written out to new immutable files. This process is
|
||||
called checkpointing. Old versions of on-disk files that are not needed by any
|
||||
timeline are removed by GC process.
|
||||
|
||||
The main responsibility of the Page Server is to process the incoming WAL, and
|
||||
reprocess it into a format that allows reasonably quick access to any page
|
||||
version.
|
||||
|
||||
The incoming WAL contains updates to arbitrary pages in the system. The
|
||||
distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between. The page server slices the
|
||||
incoming WAL per relation and page, and packages the sliced WAL into
|
||||
suitably-sized "layer files". The layer files contain all the history of the
|
||||
database, back to some reasonable retention period. This system replaces the
|
||||
base backups and the WAL archive used in a traditional PostgreSQL
|
||||
installation. The layer files are immutable, they are not modified in-place
|
||||
after creation. New layer files are created for new incoming WAL, and old layer
|
||||
files are removed when they are no longer needed. We could also replace layer
|
||||
files with new files that contain the same information, merging small files for
|
||||
example, but that hasn't been implemented yet.
|
||||
|
||||
|
||||
Cloud Storage Page Server Safekeeper
|
||||
Local disk Memory WAL
|
||||
|
||||
|AAAA| |AAAA|AAAA| |AA
|
||||
|BBBB| |BBBB|BBBB| |
|
||||
|CCCC|CCCC| <---- |CCCC|CCCC|CCCC| <--- |CC <---- ADEBAABED
|
||||
|DDDD|DDDD| |DDDD|DDDD| |DDD
|
||||
|EEEE| |EEEE|EEEE|EEEE| |E
|
||||
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
memory. The page server memory can be thought of as a quick "reorder buffer",
|
||||
used to hold the incoming WAL and reorder it so that we keep the WAL records for
|
||||
the same page and relation close to each other.
|
||||
|
||||
From the page server memory, whenever enough WAL has been accumulated for one
|
||||
relation segment, it is moved to local disk, as a new layer file, and the memory
|
||||
is released.
|
||||
|
||||
From the local disk, the layers are further copied to Cloud Storage, for
|
||||
long-term archival. After a layer has been copied to Cloud Storage, it can be
|
||||
removed from local disk, although we currently keep everything locally for fast
|
||||
access. If a layer is needed that isn't found locally, it is fetched from Cloud
|
||||
Storage and stored in local disk.
|
||||
The on-disk format is based on immutable files. The page server
|
||||
receives a stream of incoming WAL, parses the WAL records to determine
|
||||
which pages they apply to, and accumulates the incoming changes in
|
||||
memory. Every now and then, the accumulated changes are written out to
|
||||
new immutable files. This process is called checkpointing. Old versions
|
||||
of on-disk files that are not needed by any timeline are removed by GC
|
||||
process.
|
||||
|
||||
# Terms used in layered repository
|
||||
|
||||
@@ -58,9 +14,32 @@ Storage and stored in local disk.
|
||||
- Segment - one slice of a Relish that is stored in a LayeredTimeline.
|
||||
- Layer - specific version of a relish Segment in a range of LSNs.
|
||||
|
||||
# Layer map
|
||||
Layers can be InMemory or OnDisk:
|
||||
- InMemory layer is not durably stored and needs to rebuild from WAL on pageserver start.
|
||||
- OnDisk layer is durably stored.
|
||||
|
||||
The LayerMap tracks what layers exist for all the relishes in a timeline.
|
||||
OnDisk layers can be Image or Delta:
|
||||
- ImageLayer represents an image or a snapshot of a segment at one particular LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs.
|
||||
|
||||
Dropped segments are always represented on disk by DeltaLayer.
|
||||
|
||||
LSN range defined by start_lsn and end_lsn:
|
||||
- start_lsn is inclusive.
|
||||
- end_lsn is exclusive.
|
||||
|
||||
For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen
|
||||
in-memory layer or a delta layer, it is a valid end bound. An image
|
||||
layer represents snapshot at one LSN, so end_lsn is always the
|
||||
snapshot LSN + 1
|
||||
|
||||
Layers can be open or historical:
|
||||
- Open layer is a writeable one. Only InMemory layer can be open.
|
||||
FIXME: If open layer is dropped, it is not writeable, so it should be turned into historical,
|
||||
but now it is not implemented - see bug #569.
|
||||
- Historical layer is the one that cannot be modified anymore. Now only OnDisk layers can be historical.
|
||||
|
||||
- LayerMap - a map that tracks what layers exist for all the relishes in a timeline.
|
||||
|
||||
LayerMap consists of two data structures:
|
||||
- segs - All the layers keyed by segment tag
|
||||
@@ -75,55 +54,8 @@ TODO: Are there any exceptions to this?
|
||||
For example, timeline.list_rels(lsn) will return all segments that are visible in this timeline at the LSN,
|
||||
including ones that were not modified in this timeline and thus don't have a layer in the timeline's LayerMap.
|
||||
|
||||
|
||||
# Different kinds of layers
|
||||
|
||||
A layer can be in different states:
|
||||
|
||||
- Open - a layer where new WAL records can be appended to.
|
||||
- Closed - a layer that is read-only, no new WAL records can be appended to it
|
||||
- Historic: synonym for closed
|
||||
- InMemory: A layer that needs to be rebuilt from WAL on pageserver start.
|
||||
To avoid OOM errors, InMemory layers can be spilled to disk into ephemeral file.
|
||||
- OnDisk: A layer that is stored on disk. If its end-LSN is older than
|
||||
disk_consistent_lsn, it is known to be fully flushed and fsync'd to local disk.
|
||||
- Frozen layer: an in-memory layer that is Closed.
|
||||
|
||||
TODO: Clarify the difference between Closed, Historic and Frozen.
|
||||
|
||||
There are two kinds of OnDisk layers:
|
||||
- ImageLayer represents an image or a snapshot of a 10 MB relish segment, at one particular LSN.
|
||||
- DeltaLayer represents a collection of WAL records or page images in a range of LSNs, for one
|
||||
relish segment.
|
||||
|
||||
Dropped segments are always represented on disk by DeltaLayer.
|
||||
|
||||
# Layer life cycle
|
||||
|
||||
LSN range defined by start_lsn and end_lsn:
|
||||
- start_lsn is inclusive.
|
||||
- end_lsn is exclusive.
|
||||
|
||||
For an open in-memory layer, the end_lsn is MAX_LSN. For a frozen in-memory
|
||||
layer or a delta layer, it is a valid end bound. An image layer represents
|
||||
snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
|
||||
|
||||
Every layer starts its life as an Open In-Memory layer. When the page server
|
||||
receives the first WAL record for a segment, it creates a new In-Memory layer
|
||||
for it, and puts it to the layer map. Later, the layer is old enough, its
|
||||
contents are written to disk, as On-Disk layers. This process is called
|
||||
"evicting" a layer.
|
||||
|
||||
Layer eviction is a two-step process: First, the layer is marked as closed, so
|
||||
that it no longer accepts new WAL records, and the layer map is updated
|
||||
accordingly. If a new WAL record for that segment arrives after this step, a new
|
||||
Open layer is created to hold it. After this first step, the layer is a Closed
|
||||
InMemory state. This first step is called "freezing" the layer.
|
||||
|
||||
In the second step, new Delta and Image layers are created, containing all the
|
||||
data in the Frozen InMemory layer. When the new layers are ready, the original
|
||||
frozen layer is replaced with the new layers in the layer map, and the original
|
||||
frozen layer is dropped, releasing the memory.
|
||||
TODO:
|
||||
Describe GC and checkpoint interval settings.
|
||||
|
||||
# Layer files (On-disk layers)
|
||||
|
||||
@@ -434,8 +366,6 @@ is a newer layer file there. TODO: This optimization hasn't been
|
||||
implemented! The GC algorithm will currently keep the file on the
|
||||
'main' branch anyway, for as long as the child branch exists.
|
||||
|
||||
TODO:
|
||||
Describe GC and checkpoint interval settings.
|
||||
|
||||
# TODO: On LSN ranges
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::io::{Read, Write};
|
||||
use std::os::unix::prelude::FileExt;
|
||||
use std::{fs::File, io::Write};
|
||||
|
||||
use anyhow::Result;
|
||||
use bookfile::{BookWriter, BoundedReader, ChapterId, ChapterWriter};
|
||||
@@ -11,7 +10,7 @@ pub struct BlobRange {
|
||||
size: usize,
|
||||
}
|
||||
|
||||
pub fn read_blob<F: FileExt>(reader: &BoundedReader<&'_ F>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
pub fn read_blob(reader: &BoundedReader<&'_ File>, range: &BlobRange) -> Result<Vec<u8>> {
|
||||
let mut buf = vec![0u8; range.size];
|
||||
reader.read_exact_at(&mut buf, range.offset)?;
|
||||
Ok(buf)
|
||||
@@ -29,14 +28,14 @@ impl<W: Write> BlobWriter<W> {
|
||||
Self { writer, offset: 0 }
|
||||
}
|
||||
|
||||
pub fn write_blob_from_reader(&mut self, r: &mut impl Read) -> Result<BlobRange> {
|
||||
let len = std::io::copy(r, &mut self.writer)?;
|
||||
pub fn write_blob(&mut self, blob: &[u8]) -> Result<BlobRange> {
|
||||
self.writer.write_all(blob)?;
|
||||
|
||||
let range = BlobRange {
|
||||
offset: self.offset,
|
||||
size: len as usize,
|
||||
size: blob.len(),
|
||||
};
|
||||
self.offset += len as u64;
|
||||
self.offset += blob.len() as u64;
|
||||
Ok(range)
|
||||
}
|
||||
|
||||
|
||||
@@ -39,11 +39,9 @@
|
||||
//!
|
||||
use crate::layered_repository::blob::BlobWriter;
|
||||
use crate::layered_repository::filename::{DeltaFileName, PathOrConf};
|
||||
use crate::layered_repository::page_versions::PageVersions;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag,
|
||||
};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::waldecoder;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
@@ -55,6 +53,7 @@ use zenith_utils::vec_map::VecMap;
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::ops::Bound::Included;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -140,8 +139,6 @@ pub struct DeltaLayerInner {
|
||||
/// loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
book: Option<Book<VirtualFile>>,
|
||||
|
||||
/// All versions of all pages in the file are are kept here.
|
||||
/// Indexed by block number and LSN.
|
||||
page_version_metas: VecMap<(u32, Lsn), BlobRange>,
|
||||
@@ -151,10 +148,6 @@ pub struct DeltaLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for DeltaLayer {
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -184,28 +177,18 @@ impl Layer for DeltaLayer {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut need_image = true;
|
||||
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if &self.end_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
{
|
||||
// Open the file and lock the metadata in memory
|
||||
// TODO: avoid opening the file for each read
|
||||
let (_path, book) = self.open_book()?;
|
||||
let page_version_reader = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
let inner = self.load()?;
|
||||
let page_version_reader = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.expect("should be loaded in load call above")
|
||||
.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
|
||||
// Scan the metadata BTreeMap backwards, starting from the given entry.
|
||||
let minkey = (blknum, Lsn(0));
|
||||
@@ -216,31 +199,24 @@ impl Layer for DeltaLayer {
|
||||
.iter()
|
||||
.rev();
|
||||
for ((_blknum, pv_lsn), blob_range) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if pv_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = PageVersion::des(&read_blob(&page_version_reader, blob_range)?)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
// Found a page image, return it
|
||||
reconstruct_data.page_img = Some(img);
|
||||
if let Some(img) = pv.page_image {
|
||||
// Found a page image, return it
|
||||
reconstruct_data.page_img = Some(img);
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = pv.record {
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let will_init = rec.will_init;
|
||||
reconstruct_data.records.push((*pv_lsn, rec));
|
||||
if will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -250,7 +226,7 @@ impl Layer for DeltaLayer {
|
||||
// If an older page image is needed to reconstruct the page, let the
|
||||
// caller know.
|
||||
if need_image {
|
||||
Ok(PageReconstructResult::Continue(Lsn(self.start_lsn.0 - 1)))
|
||||
Ok(PageReconstructResult::Continue(self.start_lsn))
|
||||
} else {
|
||||
Ok(PageReconstructResult::Complete)
|
||||
}
|
||||
@@ -297,11 +273,6 @@ impl Layer for DeltaLayer {
|
||||
inner.page_version_metas = VecMap::default();
|
||||
inner.relsizes = VecMap::default();
|
||||
inner.loaded = false;
|
||||
|
||||
// Note: we keep the Book open. Is that a good idea? The virtual file
|
||||
// machinery has its own rules for closing the file descriptor if it's not
|
||||
// needed, but the Book struct uses up some memory, too.
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -315,10 +286,6 @@ impl Layer for DeltaLayer {
|
||||
true
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
@@ -332,11 +299,7 @@ impl Layer for DeltaLayer {
|
||||
println!(" {}: {}", k, v);
|
||||
}
|
||||
println!("--- page versions ---");
|
||||
|
||||
let path = self.path();
|
||||
let file = std::fs::File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.chapter_reader(PAGE_VERSIONS_CHAPTER)?;
|
||||
for ((blk, lsn), blob_range) in inner.page_version_metas.as_slice() {
|
||||
let mut desc = String::new();
|
||||
@@ -344,22 +307,19 @@ impl Layer for DeltaLayer {
|
||||
let buf = read_blob(&chapter, blob_range)?;
|
||||
let pv = PageVersion::des(&buf)?;
|
||||
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
rec.rec.len(),
|
||||
rec.will_init,
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
if let Some(img) = pv.page_image.as_ref() {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
if let Some(rec) = pv.record.as_ref() {
|
||||
let wal_desc = waldecoder::describe_wal_record(&rec.rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
rec.rec.len(),
|
||||
rec.will_init,
|
||||
wal_desc
|
||||
)?;
|
||||
}
|
||||
|
||||
println!(" blk {} at {}: {}", blk, lsn, desc);
|
||||
}
|
||||
|
||||
@@ -383,14 +343,14 @@ impl DeltaLayer {
|
||||
}
|
||||
|
||||
/// Create a new delta file, using the given page versions and relsizes.
|
||||
/// The page versions are passed in a PageVersions struct. If 'cutoff' is
|
||||
/// given, only page versions with LSN < cutoff are included.
|
||||
/// The page versions are passed by an iterator; the iterator must return
|
||||
/// page versions in blknum+lsn order.
|
||||
///
|
||||
/// This is used to write the in-memory layer to disk. The page_versions and
|
||||
/// relsizes are thus passed in the same format as they are in the in-memory
|
||||
/// layer, as that's expedient.
|
||||
/// This is used to write the in-memory layer to disk. The in-memory layer uses the same
|
||||
/// data structure with two btreemaps as we do, so passing the btreemaps is currently
|
||||
/// expedient.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn create(
|
||||
pub fn create<'a>(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
@@ -398,8 +358,7 @@ impl DeltaLayer {
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
dropped: bool,
|
||||
page_versions: &PageVersions,
|
||||
cutoff: Option<Lsn>,
|
||||
page_versions: impl Iterator<Item = (u32, Lsn, &'a PageVersion)>,
|
||||
relsizes: VecMap<Lsn, u32>,
|
||||
) -> Result<DeltaLayer> {
|
||||
if seg.rel.is_blocky() {
|
||||
@@ -415,33 +374,27 @@ impl DeltaLayer {
|
||||
end_lsn,
|
||||
dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
loaded: true,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes,
|
||||
}),
|
||||
};
|
||||
let mut inner = delta_layer.inner.lock().unwrap();
|
||||
|
||||
// Write the data into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
// Write the in-memory btreemaps into a file
|
||||
let path = delta_layer.path();
|
||||
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = delta_layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let file = File::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, DELTA_FILE_MAGIC)?;
|
||||
|
||||
let mut page_version_writer = BlobWriter::new(book, PAGE_VERSIONS_CHAPTER);
|
||||
|
||||
let page_versions_iter = page_versions.ordered_page_version_iter(cutoff);
|
||||
for (blknum, lsn, pos) in page_versions_iter {
|
||||
let blob_range =
|
||||
page_version_writer.write_blob_from_reader(&mut page_versions.reader(pos)?)?;
|
||||
for (blknum, lsn, page_version) in page_versions {
|
||||
let buf = PageVersion::ser(page_version)?;
|
||||
let blob_range = page_version_writer.write_blob(&buf)?;
|
||||
|
||||
inner
|
||||
.page_version_metas
|
||||
@@ -478,8 +431,7 @@ impl DeltaLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", &path.display());
|
||||
|
||||
@@ -488,6 +440,15 @@ impl DeltaLayer {
|
||||
Ok(delta_layer)
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
Ok((path, book))
|
||||
}
|
||||
|
||||
///
|
||||
/// Load the contents of the file into memory
|
||||
///
|
||||
@@ -499,14 +460,7 @@ impl DeltaLayer {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = self.path();
|
||||
|
||||
// Open the file if it's not open already.
|
||||
if inner.book.is_none() {
|
||||
let file = VirtualFile::open(&path)?;
|
||||
inner.book = Some(Book::new(file)?);
|
||||
}
|
||||
let book = inner.book.as_ref().unwrap();
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -541,9 +495,11 @@ impl DeltaLayer {
|
||||
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
inner.page_version_metas = page_version_metas;
|
||||
inner.relsizes = relsizes;
|
||||
inner.loaded = true;
|
||||
*inner = DeltaLayerInner {
|
||||
loaded: true,
|
||||
page_version_metas,
|
||||
relsizes,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
@@ -565,7 +521,6 @@ impl DeltaLayer {
|
||||
dropped: filename.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
@@ -575,10 +530,7 @@ impl DeltaLayer {
|
||||
/// Create a DeltaLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<Self>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<Self> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
@@ -592,7 +544,6 @@ impl DeltaLayer {
|
||||
dropped: summary.dropped,
|
||||
inner: Mutex::new(DeltaLayerInner {
|
||||
loaded: false,
|
||||
book: None,
|
||||
page_version_metas: VecMap::default(),
|
||||
relsizes: VecMap::default(),
|
||||
}),
|
||||
|
||||
@@ -1,298 +0,0 @@
|
||||
//! Implementation of append-only file data structure
|
||||
//! used to keep in-memory layers spilled on disk.
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::page_cache::{ReadBufResult, WriteBufResult};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
use lazy_static::lazy_static;
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::{Error, ErrorKind, Seek, SeekFrom, Write};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
lazy_static! {
|
||||
///
|
||||
/// This is the global cache of file descriptors (File objects).
|
||||
///
|
||||
static ref EPHEMERAL_FILES: RwLock<EphemeralFiles> = RwLock::new(EphemeralFiles {
|
||||
next_file_id: 1,
|
||||
files: HashMap::new(),
|
||||
});
|
||||
}
|
||||
|
||||
pub struct EphemeralFiles {
|
||||
next_file_id: u64,
|
||||
|
||||
files: HashMap<u64, Arc<VirtualFile>>,
|
||||
}
|
||||
|
||||
pub struct EphemeralFile {
|
||||
file_id: u64,
|
||||
_tenantid: ZTenantId,
|
||||
_timelineid: ZTimelineId,
|
||||
file: Arc<VirtualFile>,
|
||||
|
||||
pos: u64,
|
||||
}
|
||||
|
||||
impl EphemeralFile {
|
||||
pub fn create(
|
||||
conf: &PageServerConf,
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<EphemeralFile, std::io::Error> {
|
||||
let mut l = EPHEMERAL_FILES.write().unwrap();
|
||||
let file_id = l.next_file_id;
|
||||
l.next_file_id += 1;
|
||||
|
||||
let filename = conf
|
||||
.timeline_path(&timelineid, &tenantid)
|
||||
.join(PathBuf::from(format!("ephemeral-{}", file_id)));
|
||||
|
||||
let file = VirtualFile::open_with_options(
|
||||
&filename,
|
||||
OpenOptions::new().read(true).write(true).create(true),
|
||||
)?;
|
||||
let file_rc = Arc::new(file);
|
||||
l.files.insert(file_id, file_rc.clone());
|
||||
|
||||
Ok(EphemeralFile {
|
||||
file_id,
|
||||
_tenantid: tenantid,
|
||||
_timelineid: timelineid,
|
||||
file: file_rc,
|
||||
pos: 0,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), Error> {
|
||||
let mut off = 0;
|
||||
while off < PAGE_SZ {
|
||||
let n = self
|
||||
.file
|
||||
.read_at(&mut buf[off..], blkno as u64 * PAGE_SZ as u64 + off as u64)?;
|
||||
|
||||
if n == 0 {
|
||||
// Reached EOF. Fill the rest of the buffer with zeros.
|
||||
const ZERO_BUF: [u8; PAGE_SZ] = [0u8; PAGE_SZ];
|
||||
|
||||
buf[off..].copy_from_slice(&ZERO_BUF[off..]);
|
||||
break;
|
||||
}
|
||||
|
||||
off += n as usize;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for EphemeralFile {
|
||||
fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, dstbuf.len());
|
||||
|
||||
let read_guard;
|
||||
let mut write_guard;
|
||||
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.read_ephemeral_buf(self.file_id, blkno) {
|
||||
ReadBufResult::Found(guard) => {
|
||||
read_guard = guard;
|
||||
read_guard.as_ref()
|
||||
}
|
||||
ReadBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to read the requested slice from the
|
||||
// buffer.
|
||||
write_guard.as_ref()
|
||||
}
|
||||
};
|
||||
|
||||
dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
// Look up the right page
|
||||
let blkno = (offset / PAGE_SZ as u64) as u32;
|
||||
let off = offset as usize % PAGE_SZ;
|
||||
let len = min(PAGE_SZ - off, srcbuf.len());
|
||||
|
||||
let mut write_guard;
|
||||
let cache = page_cache::get();
|
||||
let buf = match cache.write_ephemeral_buf(self.file_id, blkno) {
|
||||
WriteBufResult::Found(guard) => {
|
||||
write_guard = guard;
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
WriteBufResult::NotFound(guard) => {
|
||||
// Read the page from disk into the buffer
|
||||
// TODO: if we're overwriting the whole page, no need to read it in first
|
||||
write_guard = guard;
|
||||
self.fill_buffer(write_guard.deref_mut(), blkno)?;
|
||||
write_guard.mark_valid();
|
||||
|
||||
// And then fall through to modify it.
|
||||
write_guard.deref_mut()
|
||||
}
|
||||
};
|
||||
|
||||
buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
|
||||
write_guard.mark_dirty();
|
||||
Ok(len)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for EphemeralFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
|
||||
let n = self.write_at(buf, self.pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
todo!()
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for EphemeralFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(_offset) => {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
"SeekFrom::End not supported by EphemeralFile",
|
||||
));
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
if pos < 0 {
|
||||
return Err(Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"offset would be negative",
|
||||
));
|
||||
}
|
||||
if pos > u64::MAX as i128 {
|
||||
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
|
||||
}
|
||||
self.pos = pos as u64;
|
||||
}
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for EphemeralFile {
|
||||
fn drop(&mut self) {
|
||||
// drop all pages from page cache
|
||||
let cache = page_cache::get();
|
||||
cache.drop_buffers_for_ephemeral(self.file_id);
|
||||
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
// unlink file
|
||||
// FIXME: print error
|
||||
let _ = std::fs::remove_file(&self.file.path);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
"could not write back page, not found in ephemeral files hash",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
|
||||
fn repo_harness(
|
||||
test_name: &str,
|
||||
) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId), Error> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
}
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, Error> {
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0u8);
|
||||
|
||||
efile.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
Ok(String::from_utf8_lossy(&buf)
|
||||
.trim_end_matches('\0')
|
||||
.to_string())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ephemeral_files() -> Result<(), Error> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("ephemeral_files")?;
|
||||
|
||||
let mut file_a = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
file_a.write_all(b"foo")?;
|
||||
assert_eq!("foo", read_string(&file_a, 0, 20)?);
|
||||
|
||||
file_a.write_all(b"bar")?;
|
||||
assert_eq!("foobar", read_string(&file_a, 0, 20)?);
|
||||
|
||||
// Open a lot of files, enough to cause some page evictions.
|
||||
let mut efiles = Vec::new();
|
||||
for fileno in 0..100 {
|
||||
let mut efile = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
efile.write_all(format!("file {}", fileno).as_bytes())?;
|
||||
assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
|
||||
efiles.push((fileno, efile));
|
||||
}
|
||||
|
||||
// Check that all the files can still be read from. Use them in random order for
|
||||
// good measure.
|
||||
efiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for (fileno, efile) in efiles.iter_mut() {
|
||||
assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,7 @@ use anyhow::Result;
|
||||
use log::*;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::metadata::METADATA_FILE_NAME;
|
||||
use super::METADATA_FILE_NAME;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone)]
|
||||
@@ -292,7 +292,7 @@ pub fn list_files(
|
||||
deltafiles.push(deltafilename);
|
||||
} else if let Some(imgfilename) = ImageFileName::parse_str(fname) {
|
||||
imgfiles.push(imgfilename);
|
||||
} else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
|
||||
} else if fname == METADATA_FILE_NAME || fname == "ancestor" || fname.ends_with(".old") {
|
||||
// ignore these
|
||||
} else {
|
||||
warn!("unrecognized filename in timeline dir: {}", fname);
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
//!
|
||||
//! Global registry of open layers.
|
||||
//!
|
||||
//! Whenever a new in-memory layer is created to hold incoming WAL, it is registered
|
||||
//! in [`GLOBAL_LAYER_MAP`], so that we can keep track of the total number of
|
||||
//! in-memory layers in the system, and know when we need to evict some to release
|
||||
//! memory.
|
||||
//!
|
||||
//! Each layer is assigned a unique ID when it's registered in the global registry.
|
||||
//! The ID can be used to relocate the layer later, without having to hold locks.
|
||||
//!
|
||||
|
||||
use std::sync::atomic::{AtomicU8, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use super::inmemory_layer::InMemoryLayer;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
lazy_static! {
|
||||
pub static ref GLOBAL_LAYER_MAP: RwLock<InMemoryLayers> =
|
||||
RwLock::new(InMemoryLayers::default());
|
||||
}
|
||||
|
||||
// TODO these types can probably be smaller
|
||||
#[derive(PartialEq, Eq, Clone, Copy)]
|
||||
pub struct LayerId {
|
||||
index: usize,
|
||||
tag: u64, // to avoid ABA problem
|
||||
}
|
||||
|
||||
enum SlotData {
|
||||
Occupied(Arc<InMemoryLayer>),
|
||||
/// Vacant slots form a linked list, the value is the index
|
||||
/// of the next vacant slot in the list.
|
||||
Vacant(Option<usize>),
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
tag: u64,
|
||||
data: SlotData,
|
||||
usage_count: AtomicU8, // for clock algorithm
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct InMemoryLayers {
|
||||
slots: Vec<Slot>,
|
||||
num_occupied: usize,
|
||||
|
||||
// Head of free-slot list.
|
||||
next_empty_slot_idx: Option<usize>,
|
||||
}
|
||||
|
||||
impl InMemoryLayers {
|
||||
pub fn insert(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
let slot_idx = match self.next_empty_slot_idx {
|
||||
Some(slot_idx) => slot_idx,
|
||||
None => {
|
||||
let idx = self.slots.len();
|
||||
self.slots.push(Slot {
|
||||
tag: 0,
|
||||
data: SlotData::Vacant(None),
|
||||
usage_count: AtomicU8::new(0),
|
||||
});
|
||||
idx
|
||||
}
|
||||
};
|
||||
let slots_len = self.slots.len();
|
||||
|
||||
let slot = &mut self.slots[slot_idx];
|
||||
|
||||
match slot.data {
|
||||
SlotData::Occupied(_) => {
|
||||
panic!("an occupied slot was in the free list");
|
||||
}
|
||||
SlotData::Vacant(next_empty_slot_idx) => {
|
||||
self.next_empty_slot_idx = next_empty_slot_idx;
|
||||
}
|
||||
}
|
||||
|
||||
slot.data = SlotData::Occupied(layer);
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
self.num_occupied += 1;
|
||||
assert!(self.num_occupied <= slots_len);
|
||||
|
||||
LayerId {
|
||||
index: slot_idx,
|
||||
tag: slot.tag,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, layer_id: &LayerId) -> Option<Arc<InMemoryLayer>> {
|
||||
let slot = self.slots.get(layer_id.index)?; // TODO should out of bounds indexes just panic?
|
||||
if slot.tag != layer_id.tag {
|
||||
return None;
|
||||
}
|
||||
|
||||
if let SlotData::Occupied(layer) = &slot.data {
|
||||
let _ = slot.usage_count.fetch_update(
|
||||
Ordering::Relaxed,
|
||||
Ordering::Relaxed,
|
||||
|old_usage_count| {
|
||||
if old_usage_count < MAX_USAGE_COUNT {
|
||||
Some(old_usage_count + 1)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
},
|
||||
);
|
||||
Some(Arc::clone(layer))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
// TODO this won't be a public API in the future
|
||||
pub fn remove(&mut self, layer_id: &LayerId) {
|
||||
let slot = &mut self.slots[layer_id.index];
|
||||
|
||||
if slot.tag != layer_id.tag {
|
||||
return;
|
||||
}
|
||||
|
||||
match &slot.data {
|
||||
SlotData::Occupied(_layer) => {
|
||||
// TODO evict the layer
|
||||
}
|
||||
SlotData::Vacant(_) => unimplemented!(),
|
||||
}
|
||||
|
||||
slot.data = SlotData::Vacant(self.next_empty_slot_idx);
|
||||
self.next_empty_slot_idx = Some(layer_id.index);
|
||||
|
||||
assert!(self.num_occupied > 0);
|
||||
self.num_occupied -= 1;
|
||||
|
||||
slot.tag = slot.tag.wrapping_add(1);
|
||||
}
|
||||
}
|
||||
@@ -27,15 +27,15 @@ use crate::layered_repository::storage_layer::{
|
||||
};
|
||||
use crate::layered_repository::LayeredTimeline;
|
||||
use crate::layered_repository::RELISH_SEG_SIZE;
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::convert::TryInto;
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{Mutex, MutexGuard};
|
||||
@@ -104,8 +104,9 @@ enum ImageType {
|
||||
}
|
||||
|
||||
pub struct ImageLayerInner {
|
||||
/// If None, the 'image_type' has not been loaded into memory yet.
|
||||
book: Option<Book<VirtualFile>>,
|
||||
/// If false, the 'image_type' has not been
|
||||
/// loaded into memory yet.
|
||||
loaded: bool,
|
||||
|
||||
/// Derived from filename and bookfile chapter metadata
|
||||
image_type: ImageType,
|
||||
@@ -116,10 +117,6 @@ impl Layer for ImageLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -146,20 +143,16 @@ impl Layer for ImageLayer {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
assert!(lsn >= self.lsn);
|
||||
|
||||
match cached_img_lsn {
|
||||
Some(cached_lsn) if self.lsn <= cached_lsn => return Ok(PageReconstructResult::Cached),
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let inner = self.load()?;
|
||||
|
||||
let base_blknum = blknum % RELISH_SEG_SIZE;
|
||||
|
||||
let (_path, book) = self.open_book()?;
|
||||
|
||||
let buf = match &inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => {
|
||||
if base_blknum >= *num_blocks {
|
||||
@@ -169,23 +162,14 @@ impl Layer for ImageLayer {
|
||||
let mut buf = vec![0u8; BLOCK_SIZE];
|
||||
let offset = BLOCK_SIZE as u64 * base_blknum as u64;
|
||||
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
let chapter = book.chapter_reader(BLOCKY_IMAGES_CHAPTER)?;
|
||||
chapter.read_exact_at(&mut buf, offset)?;
|
||||
|
||||
buf
|
||||
}
|
||||
ImageType::NonBlocky => {
|
||||
ensure!(base_blknum == 0);
|
||||
inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?
|
||||
.into_vec()
|
||||
book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?.into_vec()
|
||||
}
|
||||
};
|
||||
|
||||
@@ -207,7 +191,14 @@ impl Layer for ImageLayer {
|
||||
Ok(true)
|
||||
}
|
||||
|
||||
///
|
||||
/// Release most of the memory used by this layer. If it's accessed again later,
|
||||
/// it will need to be loaded back.
|
||||
///
|
||||
fn unload(&self) -> Result<()> {
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
inner.image_type = ImageType::Blocky { num_blocks: 0 };
|
||||
inner.loaded = false;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -221,10 +212,6 @@ impl Layer for ImageLayer {
|
||||
false
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
println!(
|
||||
@@ -237,11 +224,8 @@ impl Layer for ImageLayer {
|
||||
match inner.image_type {
|
||||
ImageType::Blocky { num_blocks } => println!("({}) blocks ", num_blocks),
|
||||
ImageType::NonBlocky => {
|
||||
let chapter = inner
|
||||
.book
|
||||
.as_ref()
|
||||
.unwrap()
|
||||
.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
let (_path, book) = self.open_book()?;
|
||||
let chapter = book.read_chapter(NONBLOCKY_IMAGE_CHAPTER)?;
|
||||
println!("non-blocky ({} bytes)", chapter.len());
|
||||
}
|
||||
}
|
||||
@@ -289,22 +273,17 @@ impl ImageLayer {
|
||||
seg,
|
||||
lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
loaded: true,
|
||||
image_type: image_type.clone(),
|
||||
}),
|
||||
};
|
||||
let inner = layer.inner.lock().unwrap();
|
||||
|
||||
// Write the images into a file
|
||||
//
|
||||
// Note: Because we open the file in write-only mode, we cannot
|
||||
// reuse the same VirtualFile for reading later. That's why we don't
|
||||
// set inner.book here. The first read will have to re-open it.
|
||||
//
|
||||
let path = layer.path();
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = layer.path();
|
||||
let file = VirtualFile::create(&path)?;
|
||||
let file = File::create(&path)?;
|
||||
let buf_writer = BufWriter::new(file);
|
||||
let book = BookWriter::new(buf_writer, IMAGE_FILE_MAGIC)?;
|
||||
|
||||
@@ -336,8 +315,7 @@ impl ImageLayer {
|
||||
let book = chapter.close()?;
|
||||
|
||||
// This flushes the underlying 'buf_writer'.
|
||||
let writer = book.close()?;
|
||||
writer.get_ref().sync_all()?;
|
||||
book.close()?;
|
||||
|
||||
trace!("saved {}", path.display());
|
||||
|
||||
@@ -391,19 +369,11 @@ impl ImageLayer {
|
||||
// quick exit if already loaded
|
||||
let mut inner = self.inner.lock().unwrap();
|
||||
|
||||
if inner.book.is_some() {
|
||||
if inner.loaded {
|
||||
return Ok(inner);
|
||||
}
|
||||
|
||||
let path = self.path();
|
||||
let file = VirtualFile::open(&path)
|
||||
.with_context(|| format!("Failed to open virtual file '{}'", path.display()))?;
|
||||
let book = Book::new(file).with_context(|| {
|
||||
format!(
|
||||
"Failed to open virtual file '{}' as a bookfile",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
let (path, book) = self.open_book()?;
|
||||
|
||||
match &self.path_or_conf {
|
||||
PathOrConf::Conf(_) => {
|
||||
@@ -444,13 +414,22 @@ impl ImageLayer {
|
||||
debug!("loaded from {}", &path.display());
|
||||
|
||||
*inner = ImageLayerInner {
|
||||
book: Some(book),
|
||||
loaded: true,
|
||||
image_type,
|
||||
};
|
||||
|
||||
Ok(inner)
|
||||
}
|
||||
|
||||
fn open_book(&self) -> Result<(PathBuf, Book<File>)> {
|
||||
let path = self.path();
|
||||
|
||||
let file = File::open(&path)?;
|
||||
let book = Book::new(file)?;
|
||||
|
||||
Ok((path, book))
|
||||
}
|
||||
|
||||
/// Create an ImageLayer struct representing an existing file on disk
|
||||
pub fn new(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -465,7 +444,7 @@ impl ImageLayer {
|
||||
seg: filename.seg,
|
||||
lsn: filename.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
}),
|
||||
}
|
||||
@@ -474,10 +453,7 @@ impl ImageLayer {
|
||||
/// Create an ImageLayer struct representing an existing file on disk.
|
||||
///
|
||||
/// This variant is only used for debugging purposes, by the 'dump_layerfile' binary.
|
||||
pub fn new_for_path<F>(path: &Path, book: &Book<F>) -> Result<ImageLayer>
|
||||
where
|
||||
F: std::os::unix::prelude::FileExt,
|
||||
{
|
||||
pub fn new_for_path(path: &Path, book: &Book<File>) -> Result<ImageLayer> {
|
||||
let chapter = book.read_chapter(SUMMARY_CHAPTER)?;
|
||||
let summary = Summary::des(&chapter)?;
|
||||
|
||||
@@ -488,7 +464,7 @@ impl ImageLayer {
|
||||
seg: summary.seg,
|
||||
lsn: summary.lsn,
|
||||
inner: Mutex::new(ImageLayerInner {
|
||||
book: None,
|
||||
loaded: false,
|
||||
image_type: ImageType::Blocky { num_blocks: 0 },
|
||||
}),
|
||||
})
|
||||
|
||||
@@ -1,10 +1,7 @@
|
||||
//! An in-memory layer stores recently received PageVersions.
|
||||
//! The page versions are held in a BTreeMap. To avoid OOM errors, the map size is limited
|
||||
//! and layers can be spilled to disk into ephemeral files.
|
||||
//!
|
||||
//! And there's another BTreeMap to track the size of the relation.
|
||||
//! An in-memory layer stores recently received page versions in memory. The page versions
|
||||
//! are held in a BTreeMap, and there's another BTreeMap to track the size of the relation.
|
||||
//!
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
use crate::layered_repository::filename::DeltaFileName;
|
||||
use crate::layered_repository::storage_layer::{
|
||||
Layer, PageReconstructData, PageReconstructResult, PageVersion, SegmentTag, RELISH_SEG_SIZE,
|
||||
@@ -15,14 +12,15 @@ use crate::layered_repository::{DeltaLayer, ImageLayer};
|
||||
use crate::repository::WALRecord;
|
||||
use crate::PageServerConf;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{ensure, Result};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::page_versions::PageVersions;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
@@ -49,7 +47,7 @@ pub struct InMemoryLayer {
|
||||
}
|
||||
|
||||
pub struct InMemoryLayerInner {
|
||||
/// Frozen layers have an exclusive end LSN.
|
||||
/// Frozen in-memory layers have an exclusive end LSN.
|
||||
/// Writes are only allowed when this is None
|
||||
end_lsn: Option<Lsn>,
|
||||
|
||||
@@ -92,9 +90,8 @@ impl InMemoryLayerInner {
|
||||
}
|
||||
|
||||
impl Layer for InMemoryLayer {
|
||||
// An in-memory layer can be spilled to disk into ephemeral file,
|
||||
// This function is used only for debugging, so we don't need to be very precise.
|
||||
// Construct a filename as if it was a delta layer.
|
||||
// An in-memory layer doesn't really have a filename as it's not stored on disk,
|
||||
// but we construct a filename as if it was a delta layer
|
||||
fn filename(&self) -> PathBuf {
|
||||
let inner = self.inner.read().unwrap();
|
||||
|
||||
@@ -116,10 +113,6 @@ impl Layer for InMemoryLayer {
|
||||
PathBuf::from(format!("inmem-{}", delta_filename))
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
|
||||
fn get_timeline_id(&self) -> ZTimelineId {
|
||||
self.timelineid
|
||||
}
|
||||
@@ -152,7 +145,6 @@ impl Layer for InMemoryLayer {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult> {
|
||||
let mut need_image = true;
|
||||
@@ -168,29 +160,21 @@ impl Layer for InMemoryLayer {
|
||||
.get_block_lsn_range(blknum, ..=lsn)
|
||||
.iter()
|
||||
.rev();
|
||||
for (entry_lsn, pos) in iter {
|
||||
match &cached_img_lsn {
|
||||
Some(cached_lsn) if entry_lsn <= cached_lsn => {
|
||||
return Ok(PageReconstructResult::Cached)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let pv = inner.page_versions.get_page_version(*pos)?;
|
||||
match pv {
|
||||
PageVersion::Page(img) => {
|
||||
reconstruct_data.page_img = Some(img);
|
||||
for (entry_lsn, entry) in iter {
|
||||
if let Some(img) = &entry.page_image {
|
||||
reconstruct_data.page_img = Some(img.clone());
|
||||
need_image = false;
|
||||
break;
|
||||
} else if let Some(rec) = &entry.record {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
PageVersion::Wal(rec) => {
|
||||
reconstruct_data.records.push((*entry_lsn, rec.clone()));
|
||||
if rec.will_init {
|
||||
// This WAL record initializes the page, so no need to go further back
|
||||
need_image = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// No base image, and no WAL record. Huh?
|
||||
bail!("no page image or WAL record for requested page");
|
||||
}
|
||||
}
|
||||
// release lock on 'inner'
|
||||
@@ -231,13 +215,9 @@ impl Layer for InMemoryLayer {
|
||||
assert!(lsn >= self.start_lsn);
|
||||
|
||||
// Is the requested LSN after the segment was dropped?
|
||||
if inner.dropped {
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
} else {
|
||||
panic!("dropped in-memory layer with no end LSN");
|
||||
if let Some(end_lsn) = inner.end_lsn {
|
||||
if lsn >= end_lsn {
|
||||
return Ok(false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -255,17 +235,13 @@ impl Layer for InMemoryLayer {
|
||||
/// Nothing to do here. When you drop the last reference to the layer, it will
|
||||
/// be deallocated.
|
||||
fn delete(&self) -> Result<()> {
|
||||
panic!("can't delete an InMemoryLayer")
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_incremental(&self) -> bool {
|
||||
self.incremental
|
||||
}
|
||||
|
||||
fn is_in_memory(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
/// debugging function to print out the contents of the layer
|
||||
fn dump(&self) -> Result<()> {
|
||||
let inner = self.inner.read().unwrap();
|
||||
@@ -285,14 +261,14 @@ impl Layer for InMemoryLayer {
|
||||
println!("segsizes {}: {}", k, v);
|
||||
}
|
||||
|
||||
for (blknum, lsn, pos) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
let pv = inner.page_versions.get_page_version(pos)?;
|
||||
let pv_description = match pv {
|
||||
PageVersion::Page(_img) => "page",
|
||||
PageVersion::Wal(_rec) => "wal",
|
||||
};
|
||||
|
||||
println!("blk {} at {}: {}\n", blknum, lsn, pv_description);
|
||||
for (blknum, lsn, pv) in inner.page_versions.ordered_page_version_iter(None) {
|
||||
println!(
|
||||
"blk {} at {}: {}/{}\n",
|
||||
blknum,
|
||||
lsn,
|
||||
pv.page_image.is_some(),
|
||||
pv.record.is_some()
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -335,8 +311,6 @@ impl InMemoryLayer {
|
||||
segsizes.append(start_lsn, 0).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
@@ -348,7 +322,7 @@ impl InMemoryLayer {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::new(file),
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
}),
|
||||
})
|
||||
@@ -357,18 +331,32 @@ impl InMemoryLayer {
|
||||
// Write operations
|
||||
|
||||
/// Remember new page version, as a WAL record over previous version
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Wal(rec))
|
||||
pub fn put_wal_record(&self, lsn: Lsn, blknum: u32, rec: WALRecord) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: None,
|
||||
record: Some(rec),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Remember new page version, as a full page image
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> Result<u32> {
|
||||
self.put_page_version(blknum, lsn, PageVersion::Page(img))
|
||||
pub fn put_page_image(&self, blknum: u32, lsn: Lsn, img: Bytes) -> u32 {
|
||||
self.put_page_version(
|
||||
blknum,
|
||||
lsn,
|
||||
PageVersion {
|
||||
page_image: Some(img),
|
||||
record: None,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> Result<u32> {
|
||||
pub fn put_page_version(&self, blknum: u32, lsn: Lsn, pv: PageVersion) -> u32 {
|
||||
assert!(self.seg.blknum_in_seg(blknum));
|
||||
|
||||
trace!(
|
||||
@@ -382,7 +370,7 @@ impl InMemoryLayer {
|
||||
|
||||
inner.assert_writeable();
|
||||
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv)?;
|
||||
let old = inner.page_versions.append_or_update_last(blknum, lsn, pv);
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -418,7 +406,10 @@ impl InMemoryLayer {
|
||||
// subsequent call to initialize the gap page.
|
||||
let gapstart = self.seg.segno * RELISH_SEG_SIZE + oldsize;
|
||||
for gapblknum in gapstart..blknum {
|
||||
let zeropv = PageVersion::Page(ZERO_PAGE.clone());
|
||||
let zeropv = PageVersion {
|
||||
page_image: Some(ZERO_PAGE.clone()),
|
||||
record: None,
|
||||
};
|
||||
trace!(
|
||||
"filling gap blk {} with zeros for write of {}",
|
||||
gapblknum,
|
||||
@@ -426,7 +417,7 @@ impl InMemoryLayer {
|
||||
);
|
||||
let old = inner
|
||||
.page_versions
|
||||
.append_or_update_last(gapblknum, lsn, zeropv)?;
|
||||
.append_or_update_last(gapblknum, lsn, zeropv);
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
|
||||
if old.is_some() {
|
||||
@@ -438,11 +429,11 @@ impl InMemoryLayer {
|
||||
}
|
||||
|
||||
inner.segsizes.append_or_update_last(lsn, newsize).unwrap();
|
||||
return Ok(newsize - oldsize);
|
||||
return newsize - oldsize;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(0)
|
||||
0
|
||||
}
|
||||
|
||||
/// Remember that the relation was truncated at given LSN
|
||||
@@ -459,7 +450,7 @@ impl InMemoryLayer {
|
||||
let oldsize = inner.get_seg_size(lsn);
|
||||
assert!(segsize < oldsize);
|
||||
|
||||
let (old, _delta_size) = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
let old = inner.segsizes.append_or_update_last(lsn, segsize).unwrap();
|
||||
|
||||
if old.is_some() {
|
||||
// We already had an entry for this LSN. That's odd..
|
||||
@@ -511,8 +502,6 @@ impl InMemoryLayer {
|
||||
segsizes.append(start_lsn, size).unwrap();
|
||||
}
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
Ok(InMemoryLayer {
|
||||
conf,
|
||||
timelineid,
|
||||
@@ -524,7 +513,7 @@ impl InMemoryLayer {
|
||||
inner: RwLock::new(InMemoryLayerInner {
|
||||
end_lsn: None,
|
||||
dropped: false,
|
||||
page_versions: PageVersions::new(file),
|
||||
page_versions: PageVersions::default(),
|
||||
segsizes,
|
||||
}),
|
||||
})
|
||||
@@ -594,8 +583,7 @@ impl InMemoryLayer {
|
||||
self.start_lsn,
|
||||
end_lsn_exclusive,
|
||||
true,
|
||||
&inner.page_versions,
|
||||
None,
|
||||
inner.page_versions.ordered_page_version_iter(None),
|
||||
inner.segsizes.clone(),
|
||||
)?;
|
||||
trace!(
|
||||
@@ -612,9 +600,13 @@ impl InMemoryLayer {
|
||||
|
||||
// Since `end_lsn` is inclusive, subtract 1.
|
||||
// We want to make an ImageLayer for the last included LSN,
|
||||
// so the DeltaLayer should exclude that LSN.
|
||||
// so the DeltaLayer should exlcude that LSN.
|
||||
let end_lsn_inclusive = Lsn(end_lsn_exclusive.0 - 1);
|
||||
|
||||
let mut page_versions = inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(Some(end_lsn_inclusive));
|
||||
|
||||
let mut delta_layers = Vec::new();
|
||||
|
||||
if self.start_lsn != end_lsn_inclusive {
|
||||
@@ -628,8 +620,7 @@ impl InMemoryLayer {
|
||||
self.start_lsn,
|
||||
end_lsn_inclusive,
|
||||
false,
|
||||
&inner.page_versions,
|
||||
Some(end_lsn_inclusive),
|
||||
page_versions,
|
||||
segsizes,
|
||||
)?;
|
||||
delta_layers.push(delta_layer);
|
||||
@@ -640,11 +631,7 @@ impl InMemoryLayer {
|
||||
end_lsn_inclusive
|
||||
);
|
||||
} else {
|
||||
assert!(inner
|
||||
.page_versions
|
||||
.ordered_page_version_iter(None)
|
||||
.next()
|
||||
.is_none());
|
||||
assert!(page_versions.next().is_none());
|
||||
}
|
||||
|
||||
drop(inner);
|
||||
|
||||
@@ -21,8 +21,6 @@ use std::sync::Arc;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use super::global_layer_map::{LayerId, GLOBAL_LAYER_MAP};
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_INMEMORY_LAYERS: IntGauge =
|
||||
register_int_gauge!("pageserver_inmemory_layers", "Number of layers in memory")
|
||||
@@ -70,9 +68,7 @@ impl LayerMap {
|
||||
pub fn get_open(&self, tag: &SegmentTag) -> Option<Arc<InMemoryLayer>> {
|
||||
let segentry = self.segs.get(tag)?;
|
||||
|
||||
segentry
|
||||
.open_layer_id
|
||||
.and_then(|layer_id| GLOBAL_LAYER_MAP.read().unwrap().get(&layer_id))
|
||||
segentry.open.as_ref().map(Arc::clone)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -81,7 +77,7 @@ impl LayerMap {
|
||||
pub fn insert_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
let segentry = self.segs.entry(layer.get_seg_tag()).or_default();
|
||||
|
||||
let layer_id = segentry.update_open(Arc::clone(&layer));
|
||||
segentry.update_open(Arc::clone(&layer));
|
||||
|
||||
let oldest_pending_lsn = layer.get_oldest_pending_lsn();
|
||||
|
||||
@@ -93,7 +89,7 @@ impl LayerMap {
|
||||
// Also add it to the binary heap
|
||||
let open_layer_entry = OpenLayerEntry {
|
||||
oldest_pending_lsn: layer.get_oldest_pending_lsn(),
|
||||
layer_id,
|
||||
layer,
|
||||
generation: self.current_generation,
|
||||
};
|
||||
self.open_layers.push(open_layer_entry);
|
||||
@@ -101,35 +97,24 @@ impl LayerMap {
|
||||
NUM_INMEMORY_LAYERS.inc();
|
||||
}
|
||||
|
||||
/// Remove an open in-memory layer
|
||||
pub fn remove_open(&mut self, layer_id: LayerId) {
|
||||
// Note: we don't try to remove the entry from the binary heap.
|
||||
// It will be removed lazily by peek_oldest_open() when it's made it to
|
||||
// the top of the heap.
|
||||
/// Remove the oldest in-memory layer
|
||||
pub fn pop_oldest_open(&mut self) {
|
||||
// Pop it from the binary heap
|
||||
let oldest_entry = self.open_layers.pop().unwrap();
|
||||
let segtag = oldest_entry.layer.get_seg_tag();
|
||||
|
||||
let layer_opt = {
|
||||
let mut global_map = GLOBAL_LAYER_MAP.write().unwrap();
|
||||
let layer_opt = global_map.get(&layer_id);
|
||||
global_map.remove(&layer_id);
|
||||
// TODO it's bad that a ref can still exist after being evicted from cache
|
||||
layer_opt
|
||||
};
|
||||
|
||||
if let Some(layer) = layer_opt {
|
||||
let mut segentry = self.segs.get_mut(&layer.get_seg_tag()).unwrap();
|
||||
|
||||
if segentry.open_layer_id == Some(layer_id) {
|
||||
// Also remove it from the SegEntry of this segment
|
||||
segentry.open_layer_id = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!layer.is_writeable());
|
||||
assert!(layer.is_dropped());
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
// Also remove it from the SegEntry of this segment
|
||||
let mut segentry = self.segs.get_mut(&segtag).unwrap();
|
||||
if Arc::ptr_eq(segentry.open.as_ref().unwrap(), &oldest_entry.layer) {
|
||||
segentry.open = None;
|
||||
} else {
|
||||
// We could have already updated segentry.open for
|
||||
// dropped (non-writeable) layer. This is fine.
|
||||
assert!(!oldest_entry.layer.is_writeable());
|
||||
assert!(oldest_entry.layer.is_dropped());
|
||||
}
|
||||
|
||||
NUM_INMEMORY_LAYERS.dec();
|
||||
}
|
||||
|
||||
///
|
||||
@@ -214,17 +199,10 @@ impl LayerMap {
|
||||
}
|
||||
|
||||
/// Return the oldest in-memory layer, along with its generation number.
|
||||
pub fn peek_oldest_open(&mut self) -> Option<(LayerId, Arc<InMemoryLayer>, u64)> {
|
||||
let global_map = GLOBAL_LAYER_MAP.read().unwrap();
|
||||
|
||||
while let Some(oldest_entry) = self.open_layers.peek() {
|
||||
if let Some(layer) = global_map.get(&oldest_entry.layer_id) {
|
||||
return Some((oldest_entry.layer_id, layer, oldest_entry.generation));
|
||||
} else {
|
||||
self.open_layers.pop();
|
||||
}
|
||||
}
|
||||
None
|
||||
pub fn peek_oldest_open(&self) -> Option<(Arc<InMemoryLayer>, u64)> {
|
||||
self.open_layers
|
||||
.peek()
|
||||
.map(|oldest_entry| (Arc::clone(&oldest_entry.layer), oldest_entry.generation))
|
||||
}
|
||||
|
||||
/// Increment the generation number used to stamp open in-memory layers. Layers
|
||||
@@ -247,12 +225,8 @@ impl LayerMap {
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
println!("Begin dump LayerMap");
|
||||
for (seg, segentry) in self.segs.iter() {
|
||||
if let Some(open) = &segentry.open_layer_id {
|
||||
if let Some(layer) = GLOBAL_LAYER_MAP.read().unwrap().get(open) {
|
||||
layer.dump()?;
|
||||
} else {
|
||||
println!("layer not found in global map");
|
||||
}
|
||||
if let Some(open) = &segentry.open {
|
||||
open.dump()?;
|
||||
}
|
||||
|
||||
for layer in segentry.historic.iter() {
|
||||
@@ -285,7 +259,7 @@ impl IntervalItem for dyn Layer {
|
||||
/// IntervalTree.
|
||||
#[derive(Default)]
|
||||
struct SegEntry {
|
||||
open_layer_id: Option<LayerId>,
|
||||
open: Option<Arc<InMemoryLayer>>,
|
||||
historic: IntervalTree<dyn Layer>,
|
||||
}
|
||||
|
||||
@@ -301,10 +275,10 @@ impl SegEntry {
|
||||
}
|
||||
|
||||
pub fn get(&self, lsn: Lsn) -> Option<Arc<dyn Layer>> {
|
||||
if let Some(open_layer_id) = &self.open_layer_id {
|
||||
let open_layer = GLOBAL_LAYER_MAP.read().unwrap().get(open_layer_id)?;
|
||||
if open_layer.get_start_lsn() <= lsn {
|
||||
return Some(open_layer);
|
||||
if let Some(open) = &self.open {
|
||||
if open.get_start_lsn() <= lsn {
|
||||
let x: Arc<dyn Layer> = Arc::clone(open) as _;
|
||||
return Some(x);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -323,16 +297,11 @@ impl SegEntry {
|
||||
// Set new open layer for a SegEntry.
|
||||
// It's ok to rewrite previous open layer,
|
||||
// but only if it is not writeable anymore.
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) -> LayerId {
|
||||
if let Some(prev_open_layer_id) = &self.open_layer_id {
|
||||
if let Some(prev_open_layer) = GLOBAL_LAYER_MAP.read().unwrap().get(prev_open_layer_id)
|
||||
{
|
||||
assert!(!prev_open_layer.is_writeable());
|
||||
}
|
||||
pub fn update_open(&mut self, layer: Arc<InMemoryLayer>) {
|
||||
if let Some(prev_open) = &self.open {
|
||||
assert!(!prev_open.is_writeable());
|
||||
}
|
||||
let open_layer_id = GLOBAL_LAYER_MAP.write().unwrap().insert(layer);
|
||||
self.open_layer_id = Some(open_layer_id);
|
||||
open_layer_id
|
||||
self.open = Some(layer);
|
||||
}
|
||||
|
||||
pub fn insert_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
@@ -347,9 +316,9 @@ impl SegEntry {
|
||||
/// recently-added entries (i.e after last call to increment_generation()) from older
|
||||
/// entries with the same 'oldest_pending_lsn'.
|
||||
struct OpenLayerEntry {
|
||||
oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
generation: u64,
|
||||
layer_id: LayerId,
|
||||
pub oldest_pending_lsn: Lsn, // copy of layer.get_oldest_pending_lsn()
|
||||
pub generation: u64,
|
||||
pub layer: Arc<InMemoryLayer>,
|
||||
}
|
||||
impl Ord for OpenLayerEntry {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
@@ -414,13 +383,6 @@ mod tests {
|
||||
forknum: 0,
|
||||
});
|
||||
|
||||
lazy_static! {
|
||||
static ref DUMMY_TIMELINEID: ZTimelineId =
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap();
|
||||
static ref DUMMY_TENANTID: ZTenantId =
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap();
|
||||
}
|
||||
|
||||
/// Construct a dummy InMemoryLayer for testing
|
||||
fn dummy_inmem_layer(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -431,8 +393,8 @@ mod tests {
|
||||
Arc::new(
|
||||
InMemoryLayer::create(
|
||||
conf,
|
||||
*DUMMY_TIMELINEID,
|
||||
*DUMMY_TENANTID,
|
||||
ZTimelineId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
ZTenantId::from_str("00000000000000000000000000000000").unwrap(),
|
||||
SegmentTag {
|
||||
rel: TESTREL_A,
|
||||
segno,
|
||||
@@ -448,7 +410,6 @@ mod tests {
|
||||
fn test_open_layers() -> Result<()> {
|
||||
let conf = PageServerConf::dummy_conf(PageServerConf::test_repo_dir("dummy_inmem_layer"));
|
||||
let conf = Box::leak(Box::new(conf));
|
||||
std::fs::create_dir_all(conf.timeline_path(&DUMMY_TIMELINEID, &DUMMY_TENANTID))?;
|
||||
|
||||
let mut layers = LayerMap::default();
|
||||
|
||||
@@ -465,10 +426,10 @@ mod tests {
|
||||
// A helper function (closure) to pop the next oldest open entry from the layer map,
|
||||
// and assert that it is what we'd expect
|
||||
let mut assert_pop_layer = |expected_segno: u32, expected_generation: u64| {
|
||||
let (layer_id, l, generation) = layers.peek_oldest_open().unwrap();
|
||||
let (l, generation) = layers.peek_oldest_open().unwrap();
|
||||
assert!(l.get_seg_tag().segno == expected_segno);
|
||||
assert!(generation == expected_generation);
|
||||
layers.remove_open(layer_id);
|
||||
layers.pop_oldest_open();
|
||||
};
|
||||
|
||||
assert_pop_layer(0, gen1); // 0x100
|
||||
|
||||
@@ -1,226 +0,0 @@
|
||||
//! Every image of a certain timeline from [`crate::layered_repository::LayeredRepository`]
|
||||
//! has a metadata that needs to be stored persistently.
|
||||
//!
|
||||
//! Later, the file gets is used in [`crate::remote_storage::storage_sync`] as a part of
|
||||
//! external storage import and export operations.
|
||||
//!
|
||||
//! The module contains all structs and related helper methods related to timeline metadata.
|
||||
|
||||
use std::{convert::TryInto, path::PathBuf};
|
||||
|
||||
use anyhow::ensure;
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
layered_repository::{METADATA_CHECKSUM_SIZE, METADATA_MAX_DATA_SIZE, METADATA_MAX_SAFE_SIZE},
|
||||
PageServerConf,
|
||||
};
|
||||
|
||||
/// The name of the metadata file pageserver creates per timeline.
|
||||
pub const METADATA_FILE_NAME: &str = "metadata";
|
||||
|
||||
/// Metadata stored on disk for each timeline
|
||||
///
|
||||
/// The fields correspond to the values we hold in memory, in LayeredTimeline.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||
pub struct TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
// This is only set if we know it. We track it in memory when the page
|
||||
// server is running, but we only track the value corresponding to
|
||||
// 'last_record_lsn', not 'disk_consistent_lsn' which can lag behind by a
|
||||
// lot. We only store it in the metadata file when we flush *all* the
|
||||
// in-memory data so that 'last_record_lsn' is the same as
|
||||
// 'disk_consistent_lsn'. That's OK, because after page server restart, as
|
||||
// soon as we reprocess at least one record, we will have a valid
|
||||
// 'prev_record_lsn' value in memory again. This is only really needed when
|
||||
// doing a clean shutdown, so that there is no more WAL beyond
|
||||
// 'disk_consistent_lsn'
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain timeline's metadata file should be located.
|
||||
pub fn metadata_path(
|
||||
conf: &'static PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
) -> PathBuf {
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(METADATA_FILE_NAME)
|
||||
}
|
||||
|
||||
impl TimelineMetadata {
|
||||
pub fn new(
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn,
|
||||
prev_record_lsn,
|
||||
ancestor_timeline,
|
||||
ancestor_lsn,
|
||||
latest_gc_cutoff_lsn,
|
||||
initdb_lsn,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_bytes(metadata_bytes: &[u8]) -> anyhow::Result<Self> {
|
||||
ensure!(
|
||||
metadata_bytes.len() == METADATA_MAX_SAFE_SIZE,
|
||||
"metadata bytes size is wrong"
|
||||
);
|
||||
|
||||
let data = &metadata_bytes[..METADATA_MAX_DATA_SIZE];
|
||||
let calculated_checksum = crc32c::crc32c(data);
|
||||
|
||||
let checksum_bytes: &[u8; METADATA_CHECKSUM_SIZE] =
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].try_into()?;
|
||||
let expected_checksum = u32::from_le_bytes(*checksum_bytes);
|
||||
ensure!(
|
||||
calculated_checksum == expected_checksum,
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
|
||||
let data = TimelineMetadata::from(serialize::DeTimelineMetadata::des_prefix(data)?);
|
||||
assert!(data.disk_consistent_lsn.is_aligned());
|
||||
|
||||
Ok(data)
|
||||
}
|
||||
|
||||
pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
|
||||
let serializeable_metadata = serialize::SeTimelineMetadata::from(self);
|
||||
let mut metadata_bytes = serialize::SeTimelineMetadata::ser(&serializeable_metadata)?;
|
||||
assert!(metadata_bytes.len() <= METADATA_MAX_DATA_SIZE);
|
||||
metadata_bytes.resize(METADATA_MAX_SAFE_SIZE, 0u8);
|
||||
|
||||
let checksum = crc32c::crc32c(&metadata_bytes[..METADATA_MAX_DATA_SIZE]);
|
||||
metadata_bytes[METADATA_MAX_DATA_SIZE..].copy_from_slice(&u32::to_le_bytes(checksum));
|
||||
Ok(metadata_bytes)
|
||||
}
|
||||
|
||||
/// [`Lsn`] that corresponds to the corresponding timeline directory
|
||||
/// contents, stored locally in the pageserver workdir.
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
|
||||
pub fn prev_record_lsn(&self) -> Option<Lsn> {
|
||||
self.prev_record_lsn
|
||||
}
|
||||
|
||||
pub fn ancestor_timeline(&self) -> Option<ZTimelineId> {
|
||||
self.ancestor_timeline
|
||||
}
|
||||
|
||||
pub fn ancestor_lsn(&self) -> Lsn {
|
||||
self.ancestor_lsn
|
||||
}
|
||||
|
||||
pub fn latest_gc_cutoff_lsn(&self) -> Lsn {
|
||||
self.latest_gc_cutoff_lsn
|
||||
}
|
||||
|
||||
pub fn initdb_lsn(&self) -> Lsn {
|
||||
self.initdb_lsn
|
||||
}
|
||||
}
|
||||
|
||||
/// This module is for direct conversion of metadata to bytes and back.
|
||||
/// For a certain metadata, besides the conversion a few verification steps has to
|
||||
/// be done, so all serde derives are hidden from the user, to avoid accidental
|
||||
/// verification-less metadata creation.
|
||||
mod serialize {
|
||||
use serde::{Deserialize, Serialize};
|
||||
use zenith_utils::{lsn::Lsn, zid::ZTimelineId};
|
||||
|
||||
use super::TimelineMetadata;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SeTimelineMetadata<'a> {
|
||||
disk_consistent_lsn: &'a Lsn,
|
||||
prev_record_lsn: &'a Option<Lsn>,
|
||||
ancestor_timeline: &'a Option<ZTimelineId>,
|
||||
ancestor_lsn: &'a Lsn,
|
||||
latest_gc_cutoff_lsn: &'a Lsn,
|
||||
initdb_lsn: &'a Lsn,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a TimelineMetadata> for SeTimelineMetadata<'a> {
|
||||
fn from(other: &'a TimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: &other.disk_consistent_lsn,
|
||||
prev_record_lsn: &other.prev_record_lsn,
|
||||
ancestor_timeline: &other.ancestor_timeline,
|
||||
ancestor_lsn: &other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: &other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: &other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
pub(super) struct DeTimelineMetadata {
|
||||
disk_consistent_lsn: Lsn,
|
||||
prev_record_lsn: Option<Lsn>,
|
||||
ancestor_timeline: Option<ZTimelineId>,
|
||||
ancestor_lsn: Lsn,
|
||||
latest_gc_cutoff_lsn: Lsn,
|
||||
initdb_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl From<DeTimelineMetadata> for TimelineMetadata {
|
||||
fn from(other: DeTimelineMetadata) -> Self {
|
||||
Self {
|
||||
disk_consistent_lsn: other.disk_consistent_lsn,
|
||||
prev_record_lsn: other.prev_record_lsn,
|
||||
ancestor_timeline: other.ancestor_timeline,
|
||||
ancestor_lsn: other.ancestor_lsn,
|
||||
latest_gc_cutoff_lsn: other.latest_gc_cutoff_lsn,
|
||||
initdb_lsn: other.initdb_lsn,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::repository::repo_harness::TIMELINE_ID;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn metadata_serializes_correctly() {
|
||||
let original_metadata = TimelineMetadata {
|
||||
disk_consistent_lsn: Lsn(0x200),
|
||||
prev_record_lsn: Some(Lsn(0x100)),
|
||||
ancestor_timeline: Some(TIMELINE_ID),
|
||||
ancestor_lsn: Lsn(0),
|
||||
latest_gc_cutoff_lsn: Lsn(0),
|
||||
initdb_lsn: Lsn(0),
|
||||
};
|
||||
|
||||
let metadata_bytes = original_metadata
|
||||
.to_bytes()
|
||||
.expect("Should serialize correct metadata to bytes");
|
||||
|
||||
let deserialized_metadata = TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.expect("Should deserialize its own bytes");
|
||||
|
||||
assert_eq!(
|
||||
deserialized_metadata, original_metadata,
|
||||
"Metadata that was serialized to bytes and deserialized back should not change"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,78 +1,40 @@
|
||||
//!
|
||||
//! Data structure to ingest incoming WAL into an append-only file.
|
||||
//!
|
||||
//! - The file is considered temporary, and will be discarded on crash
|
||||
//! - based on a B-tree
|
||||
//!
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::{collections::HashMap, ops::RangeBounds, slice};
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::io::Seek;
|
||||
|
||||
use zenith_utils::{lsn::Lsn, vec_map::VecMap};
|
||||
|
||||
use super::storage_layer::PageVersion;
|
||||
use crate::layered_repository::ephemeral_file::EphemeralFile;
|
||||
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
const EMPTY_SLICE: &[(Lsn, PageVersion)] = &[];
|
||||
|
||||
const EMPTY_SLICE: &[(Lsn, u64)] = &[];
|
||||
|
||||
pub struct PageVersions {
|
||||
map: HashMap<u32, VecMap<Lsn, u64>>,
|
||||
|
||||
/// The PageVersion structs are stored in a serialized format in this file.
|
||||
/// Each serialized PageVersion is preceded by a 'u32' length field.
|
||||
/// The 'map' stores offsets into this file.
|
||||
file: EphemeralFile,
|
||||
}
|
||||
#[derive(Debug, Default)]
|
||||
pub struct PageVersions(HashMap<u32, VecMap<Lsn, PageVersion>>);
|
||||
|
||||
impl PageVersions {
|
||||
pub fn new(file: EphemeralFile) -> PageVersions {
|
||||
PageVersions {
|
||||
map: HashMap::new(),
|
||||
file,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append_or_update_last(
|
||||
&mut self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
page_version: PageVersion,
|
||||
) -> Result<Option<u64>> {
|
||||
// remember starting position
|
||||
let pos = self.file.stream_position()?;
|
||||
|
||||
// make room for the 'length' field by writing zeros as a placeholder.
|
||||
self.file.seek(std::io::SeekFrom::Start(pos + 4)).unwrap();
|
||||
|
||||
page_version.ser_into(&mut self.file).unwrap();
|
||||
|
||||
// write the 'length' field.
|
||||
let len = self.file.stream_position()? - pos - 4;
|
||||
let lenbuf = u32::to_ne_bytes(len as u32);
|
||||
self.file.write_all_at(&lenbuf, pos)?;
|
||||
|
||||
let map = self.map.entry(blknum).or_insert_with(VecMap::default);
|
||||
Ok(map.append_or_update_last(lsn, pos as u64).unwrap().0)
|
||||
) -> Option<PageVersion> {
|
||||
let map = self.0.entry(blknum).or_insert_with(VecMap::default);
|
||||
map.append_or_update_last(lsn, page_version).unwrap()
|
||||
}
|
||||
|
||||
/// Get all [`PageVersion`]s in a block
|
||||
fn get_block_slice(&self, blknum: u32) -> &[(Lsn, u64)] {
|
||||
self.map
|
||||
pub fn get_block_slice(&self, blknum: u32) -> &[(Lsn, PageVersion)] {
|
||||
self.0
|
||||
.get(&blknum)
|
||||
.map(VecMap::as_slice)
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
}
|
||||
|
||||
/// Get a range of [`PageVersions`] in a block
|
||||
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(&self, blknum: u32, range: R) -> &[(Lsn, u64)] {
|
||||
self.map
|
||||
pub fn get_block_lsn_range<R: RangeBounds<Lsn>>(
|
||||
&self,
|
||||
blknum: u32,
|
||||
range: R,
|
||||
) -> &[(Lsn, PageVersion)] {
|
||||
self.0
|
||||
.get(&blknum)
|
||||
.map(|vec_map| vec_map.slice_range(range))
|
||||
.unwrap_or(EMPTY_SLICE)
|
||||
@@ -81,7 +43,7 @@ impl PageVersions {
|
||||
/// Iterate through [`PageVersion`]s in (block, lsn) order.
|
||||
/// If a [`cutoff_lsn`] is set, only show versions with `lsn < cutoff_lsn`
|
||||
pub fn ordered_page_version_iter(&self, cutoff_lsn: Option<Lsn>) -> OrderedPageVersionIter<'_> {
|
||||
let mut ordered_blocks: Vec<u32> = self.map.keys().cloned().collect();
|
||||
let mut ordered_blocks: Vec<u32> = self.0.keys().cloned().collect();
|
||||
ordered_blocks.sort_unstable();
|
||||
|
||||
let slice = ordered_blocks
|
||||
@@ -97,40 +59,6 @@ impl PageVersions {
|
||||
cur_slice_iter: slice.iter(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a 'Read' that reads the page version at given offset.
|
||||
pub fn reader(&self, pos: u64) -> Result<PageVersionReader, std::io::Error> {
|
||||
// read length
|
||||
let mut lenbuf = [0u8; 4];
|
||||
self.file.read_exact_at(&mut lenbuf, pos)?;
|
||||
let len = u32::from_ne_bytes(lenbuf);
|
||||
|
||||
Ok(PageVersionReader {
|
||||
file: &self.file,
|
||||
pos: pos + 4,
|
||||
end_pos: pos + 4 + len as u64,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_page_version(&self, pos: u64) -> Result<PageVersion> {
|
||||
let mut reader = self.reader(pos)?;
|
||||
Ok(PageVersion::des_from(&mut reader)?)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageVersionReader<'a> {
|
||||
file: &'a EphemeralFile,
|
||||
pos: u64,
|
||||
end_pos: u64,
|
||||
}
|
||||
|
||||
impl<'a> std::io::Read for PageVersionReader<'a> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, std::io::Error> {
|
||||
let len = min(buf.len(), (self.end_pos - self.pos) as usize);
|
||||
let n = self.file.read_at(&mut buf[..len], self.pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OrderedPageVersionIter<'a> {
|
||||
@@ -141,7 +69,7 @@ pub struct OrderedPageVersionIter<'a> {
|
||||
|
||||
cutoff_lsn: Option<Lsn>,
|
||||
|
||||
cur_slice_iter: slice::Iter<'a, (Lsn, u64)>,
|
||||
cur_slice_iter: slice::Iter<'a, (Lsn, PageVersion)>,
|
||||
}
|
||||
|
||||
impl OrderedPageVersionIter<'_> {
|
||||
@@ -155,14 +83,14 @@ impl OrderedPageVersionIter<'_> {
|
||||
}
|
||||
|
||||
impl<'a> Iterator for OrderedPageVersionIter<'a> {
|
||||
type Item = (u32, Lsn, u64);
|
||||
type Item = (u32, Lsn, &'a PageVersion);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
if let Some((lsn, pos)) = self.cur_slice_iter.next() {
|
||||
if let Some((lsn, page_version)) = self.cur_slice_iter.next() {
|
||||
if self.is_lsn_before_cutoff(lsn) {
|
||||
let blknum = self.ordered_blocks[self.cur_block_idx];
|
||||
return Some((blknum, *lsn, *pos));
|
||||
return Some((blknum, *lsn, page_version));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,50 +104,22 @@ impl<'a> Iterator for OrderedPageVersionIter<'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bytes::Bytes;
|
||||
|
||||
use super::*;
|
||||
use crate::PageServerConf;
|
||||
use std::fs;
|
||||
use std::str::FromStr;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
fn repo_harness(test_name: &str) -> Result<(&'static PageServerConf, ZTenantId, ZTimelineId)> {
|
||||
let repo_dir = PageServerConf::test_repo_dir(test_name);
|
||||
let _ = fs::remove_dir_all(&repo_dir);
|
||||
let conf = PageServerConf::dummy_conf(repo_dir);
|
||||
// Make a static copy of the config. This can never be free'd, but that's
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenantid = ZTenantId::from_str("11000000000000000000000000000000").unwrap();
|
||||
let timelineid = ZTimelineId::from_str("22000000000000000000000000000000").unwrap();
|
||||
fs::create_dir_all(conf.timeline_path(&timelineid, &tenantid))?;
|
||||
|
||||
Ok((conf, tenantid, timelineid))
|
||||
}
|
||||
const EMPTY_PAGE_VERSION: PageVersion = PageVersion {
|
||||
page_image: None,
|
||||
record: None,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn test_ordered_iter() -> Result<()> {
|
||||
let (conf, tenantid, timelineid) = repo_harness("test_ordered_iter")?;
|
||||
|
||||
let file = EphemeralFile::create(conf, tenantid, timelineid)?;
|
||||
|
||||
let mut page_versions = PageVersions::new(file);
|
||||
|
||||
fn test_ordered_iter() {
|
||||
let mut page_versions = PageVersions::default();
|
||||
const BLOCKS: u32 = 1000;
|
||||
const LSNS: u64 = 50;
|
||||
|
||||
let empty_page = Bytes::from_static(&[0u8; 8192]);
|
||||
let empty_page_version = PageVersion::Page(empty_page);
|
||||
|
||||
for blknum in 0..BLOCKS {
|
||||
for lsn in 0..LSNS {
|
||||
let old = page_versions.append_or_update_last(
|
||||
blknum,
|
||||
Lsn(lsn),
|
||||
empty_page_version.clone(),
|
||||
)?;
|
||||
let old = page_versions.append_or_update_last(blknum, Lsn(lsn), EMPTY_PAGE_VERSION);
|
||||
assert!(old.is_none());
|
||||
}
|
||||
}
|
||||
@@ -246,7 +146,5 @@ mod tests {
|
||||
}
|
||||
assert!(iter.next().is_none());
|
||||
assert!(iter.next().is_none()); // should be robust against excessive next() calls
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
use crate::relish::RelishTag;
|
||||
use crate::repository::WALRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::ZTimelineId;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -51,10 +51,23 @@ impl SegmentTag {
|
||||
///
|
||||
/// A page version can be stored as a full page image, or as WAL record that needs
|
||||
/// to be applied over the previous page version to reconstruct this version.
|
||||
///
|
||||
/// It's also possible to have both a WAL record and a page image in the same
|
||||
/// PageVersion. That happens if page version is originally stored as a WAL record
|
||||
/// but it is later reconstructed by a GetPage@LSN request by performing WAL
|
||||
/// redo. The get_page_at_lsn() code will store the reconstructed pag image next to
|
||||
/// the WAL record in that case. TODO: That's pretty accidental, not the result
|
||||
/// of any grand design. If we want to keep reconstructed page versions around, we
|
||||
/// probably should have a separate buffer cache so that we could control the
|
||||
/// replacement policy globally. Or if we keep a reconstructed page image, we
|
||||
/// could throw away the WAL record.
|
||||
///
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub enum PageVersion {
|
||||
Page(Bytes),
|
||||
Wal(WALRecord),
|
||||
pub struct PageVersion {
|
||||
/// an 8kb page image
|
||||
pub page_image: Option<Bytes>,
|
||||
/// WAL record to get from previous page version to this one.
|
||||
pub record: Option<WALRecord>,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -80,8 +93,6 @@ pub enum PageReconstructResult {
|
||||
/// the returned LSN. This is usually considered an error, but might be OK
|
||||
/// in some circumstances.
|
||||
Missing(Lsn),
|
||||
/// Use the cached image at `cached_img_lsn` as the base image
|
||||
Cached,
|
||||
}
|
||||
|
||||
///
|
||||
@@ -93,8 +104,6 @@ pub enum PageReconstructResult {
|
||||
/// in-memory and on-disk layers.
|
||||
///
|
||||
pub trait Layer: Send + Sync {
|
||||
fn get_tenant_id(&self) -> ZTenantId;
|
||||
|
||||
/// Identify the timeline this relish belongs to
|
||||
fn get_timeline_id(&self) -> ZTimelineId;
|
||||
|
||||
@@ -129,9 +138,6 @@ pub trait Layer: Send + Sync {
|
||||
/// of the *relish*, not the beginning of the segment. The requested
|
||||
/// 'blknum' must be covered by this segment.
|
||||
///
|
||||
/// `cached_img_lsn` should be set to a cached page image's lsn < `lsn`.
|
||||
/// This function will only return data after `cached_img_lsn`.
|
||||
///
|
||||
/// See PageReconstructResult for possible return values. The collected data
|
||||
/// is appended to reconstruct_data; the caller should pass an empty struct
|
||||
/// on first call. If this returns PageReconstructResult::Continue, look up
|
||||
@@ -141,7 +147,6 @@ pub trait Layer: Send + Sync {
|
||||
&self,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
cached_img_lsn: Option<Lsn>,
|
||||
reconstruct_data: &mut PageReconstructData,
|
||||
) -> Result<PageReconstructResult>;
|
||||
|
||||
@@ -157,9 +162,6 @@ pub trait Layer: Send + Sync {
|
||||
/// the previous non-incremental layer.
|
||||
fn is_incremental(&self) -> bool;
|
||||
|
||||
/// Returns true for layers that are represented in memory.
|
||||
fn is_in_memory(&self) -> bool;
|
||||
|
||||
/// Release memory used by this layer. There is no corresponding 'load'
|
||||
/// function, that's done implicitly when you call one of the get-functions.
|
||||
fn unload(&self) -> Result<()>;
|
||||
|
||||
@@ -2,7 +2,6 @@ use layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
@@ -13,15 +12,13 @@ pub mod basebackup;
|
||||
pub mod branches;
|
||||
pub mod http;
|
||||
pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod relish;
|
||||
pub mod remote_storage;
|
||||
pub mod relish_storage;
|
||||
pub mod repository;
|
||||
pub mod restore_local_repo;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
pub mod virtual_file;
|
||||
pub mod waldecoder;
|
||||
pub mod walreceiver;
|
||||
pub mod walredo;
|
||||
@@ -39,18 +36,13 @@ pub mod defaults {
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_CHECKPOINT_PERIOD: Duration = Duration::from_secs(1);
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(10);
|
||||
pub const DEFAULT_GC_PERIOD: Duration = Duration::from_secs(100);
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 100;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
|
||||
pub const DEFAULT_OPEN_MEM_LIMIT: usize = 128 * 1024 * 1024;
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
pub const DEFAULT_RELISH_STORAGE_MAX_CONCURRENT_SYNC_LIMITS: usize = 100;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -79,10 +71,6 @@ pub struct PageServerConf {
|
||||
pub gc_period: Duration,
|
||||
pub superuser: String,
|
||||
|
||||
pub open_mem_limit: usize,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
|
||||
// Repository directory, relative to current working directory.
|
||||
// Normally, the page server changes the current working directory
|
||||
// to the repository, and 'workdir' is always '.'. But we don't do
|
||||
@@ -96,7 +84,7 @@ pub struct PageServerConf {
|
||||
pub auth_type: AuthType,
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
pub relish_storage_config: Option<RelishStorageConfig>,
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
@@ -136,6 +124,10 @@ impl PageServerConf {
|
||||
self.timelines_path(tenantid).join(timelineid.to_string())
|
||||
}
|
||||
|
||||
fn ancestor_path(&self, timelineid: &ZTimelineId, tenantid: &ZTenantId) -> PathBuf {
|
||||
self.timeline_path(timelineid, tenantid).join("ancestor")
|
||||
}
|
||||
|
||||
//
|
||||
// Postgres distribution paths
|
||||
//
|
||||
@@ -161,9 +153,6 @@ impl PageServerConf {
|
||||
checkpoint_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
open_mem_limit: defaults::DEFAULT_OPEN_MEM_LIMIT,
|
||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||
max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
superuser: "zenith_admin".to_string(),
|
||||
@@ -171,7 +160,7 @@ impl PageServerConf {
|
||||
pg_distrib_dir: "".into(),
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
relish_storage_config: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -185,20 +174,18 @@ pub enum CheckpointConfig {
|
||||
Forced,
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
/// External relish storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
pub struct RelishStorageConfig {
|
||||
/// Limits the number of concurrent sync operations between pageserver and relish storage.
|
||||
pub max_concurrent_sync: usize,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
pub storage: RelishStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
/// A kind of a relish storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum RemoteStorageKind {
|
||||
pub enum RelishStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored relish data into.
|
||||
LocalFs(PathBuf),
|
||||
|
||||
@@ -1,778 +0,0 @@
|
||||
//!
|
||||
//! Global page cache
|
||||
//!
|
||||
//! The page cache uses up most of the memory in the page server. It is shared
|
||||
//! by all tenants, and it is used to store different kinds of pages. Sharing
|
||||
//! the cache allows memory to be dynamically allocated where it's needed the
|
||||
//! most.
|
||||
//!
|
||||
//! The page cache consists of fixed-size buffers, 8 kB each to match the
|
||||
//! PostgreSQL buffer size, and a Slot struct for each buffer to contain
|
||||
//! information about what's stored in the buffer.
|
||||
//!
|
||||
//! # Locking
|
||||
//!
|
||||
//! There are two levels of locking involved: There's one lock for the "mapping"
|
||||
//! from page identifier (tenant ID, timeline ID, rel, block, LSN) to the buffer
|
||||
//! slot, and a separate lock on each slot. To read or write the contents of a
|
||||
//! slot, you must hold the lock on the slot in read or write mode,
|
||||
//! respectively. To change the mapping of a slot, i.e. to evict a page or to
|
||||
//! assign a buffer for a page, you must hold the mapping lock and the lock on
|
||||
//! the slot at the same time.
|
||||
//!
|
||||
//! Whenever you need to hold both locks simultenously, the slot lock must be
|
||||
//! acquired first. This consistent ordering avoids deadlocks. To look up a page
|
||||
//! in the cache, you would first look up the mapping, while holding the mapping
|
||||
//! lock, and then lock the slot. You must release the mapping lock in between,
|
||||
//! to obey the lock ordering and avoid deadlock.
|
||||
//!
|
||||
//! A slot can momentarily have invalid contents, even if it's already been
|
||||
//! inserted to the mapping, but you must hold the write-lock on the slot until
|
||||
//! the contents are valid. If you need to release the lock without initializing
|
||||
//! the contents, you must remove the mapping first. We make that easy for the
|
||||
//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
|
||||
//! page, the caller must explicitly call guard.mark_valid() after it has
|
||||
//! initialized it. If the guard is dropped without calling mark_valid(), the
|
||||
//! mapping is automatically removed and the slot is marked free.
|
||||
//!
|
||||
|
||||
use std::{
|
||||
collections::{hash_map::Entry, HashMap},
|
||||
convert::TryInto,
|
||||
sync::{
|
||||
atomic::{AtomicU8, AtomicUsize, Ordering},
|
||||
RwLock, RwLockReadGuard, RwLockWriteGuard,
|
||||
},
|
||||
};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::writeback_ephemeral_file;
|
||||
use crate::{relish::RelTag, PageServerConf};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
const TEST_PAGE_CACHE_SIZE: usize = 10;
|
||||
|
||||
///
|
||||
/// Initialize the page cache. This must be called once at page server startup.
|
||||
///
|
||||
pub fn init(conf: &'static PageServerConf) {
|
||||
if PAGE_CACHE
|
||||
.set(PageCache::new(conf.page_cache_size))
|
||||
.is_err()
|
||||
{
|
||||
panic!("page cache already initialized");
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Get a handle to the page cache.
|
||||
///
|
||||
pub fn get() -> &'static PageCache {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// page_cache::init(). Initialize it here with a tiny cache, so that the
|
||||
// page cache is usable in unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
PAGE_CACHE.get_or_init(|| PageCache::new(TEST_PAGE_CACHE_SIZE))
|
||||
} else {
|
||||
PAGE_CACHE.get().expect("page cache not initialized")
|
||||
}
|
||||
}
|
||||
|
||||
pub const PAGE_SZ: usize = postgres_ffi::pg_constants::BLCKSZ as usize;
|
||||
const MAX_USAGE_COUNT: u8 = 5;
|
||||
|
||||
///
|
||||
/// CacheKey uniquely identifies a "thing" to cache in the page cache.
|
||||
///
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
enum CacheKey {
|
||||
MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey,
|
||||
lsn: Lsn,
|
||||
},
|
||||
EphemeralPage {
|
||||
file_id: u64,
|
||||
blkno: u32,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
struct MaterializedPageHashKey {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Version {
|
||||
lsn: Lsn,
|
||||
slot_idx: usize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: RwLock<SlotInner>,
|
||||
usage_count: AtomicU8,
|
||||
}
|
||||
|
||||
struct SlotInner {
|
||||
key: Option<CacheKey>,
|
||||
buf: &'static mut [u8; PAGE_SZ],
|
||||
dirty: bool,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
/// Increment usage count on the buffer, with ceiling at MAX_USAGE_COUNT.
|
||||
fn inc_usage_count(&self) {
|
||||
let _ = self
|
||||
.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
|
||||
if val == MAX_USAGE_COUNT {
|
||||
None
|
||||
} else {
|
||||
Some(val + 1)
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Decrement usage count on the buffer, unless it's already zero. Returns
|
||||
/// the old usage count.
|
||||
fn dec_usage_count(&self) -> u8 {
|
||||
let count_res =
|
||||
self.usage_count
|
||||
.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |val| {
|
||||
if val == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(val - 1)
|
||||
}
|
||||
});
|
||||
|
||||
match count_res {
|
||||
Ok(usage_count) => usage_count,
|
||||
Err(usage_count) => usage_count,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PageCache {
|
||||
/// This contains the mapping from the cache key to buffer slot that currently
|
||||
/// contains the page, if any.
|
||||
///
|
||||
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
|
||||
/// this HashMap can be replaced with a more concurrent version, there are
|
||||
/// plenty of such crates around.
|
||||
///
|
||||
/// If you add support for caching different kinds of objects, each object kind
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
slots: Box<[Slot]>,
|
||||
|
||||
/// Index of the next candidate to evict, for the Clock replacement algorithm.
|
||||
/// This is interpreted modulo the page cache size.
|
||||
next_evict_slot: AtomicUsize,
|
||||
}
|
||||
|
||||
///
|
||||
/// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
pub struct PageReadGuard<'i>(RwLockReadGuard<'i, SlotInner>);
|
||||
|
||||
impl std::ops::Deref for PageReadGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.buf
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// PageWriteGuard is a lease on a buffer for modifying it. The page is kept locked
|
||||
/// until the guard is dropped.
|
||||
///
|
||||
/// Counterintuitively, this is used even for a read, if the requested page is not
|
||||
/// currently found in the page cache. In that case, the caller of lock_for_read()
|
||||
/// is expected to fill in the page contents and call mark_valid(). Similarly
|
||||
/// lock_for_write() can return an invalid buffer that the caller is expected to
|
||||
/// to initialize.
|
||||
///
|
||||
pub struct PageWriteGuard<'i> {
|
||||
inner: RwLockWriteGuard<'i, SlotInner>,
|
||||
|
||||
// Are the page contents currently valid?
|
||||
valid: bool,
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for PageWriteGuard<'_> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for PageWriteGuard<'_> {
|
||||
type Target = [u8; PAGE_SZ];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.inner.buf
|
||||
}
|
||||
}
|
||||
|
||||
impl PageWriteGuard<'_> {
|
||||
/// Mark that the buffer contents are now valid.
|
||||
pub fn mark_valid(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
assert!(
|
||||
!self.valid,
|
||||
"mark_valid called on a buffer that was already valid"
|
||||
);
|
||||
self.valid = true;
|
||||
}
|
||||
pub fn mark_dirty(&mut self) {
|
||||
// only ephemeral pages can be dirty ATM.
|
||||
assert!(matches!(
|
||||
self.inner.key,
|
||||
Some(CacheKey::EphemeralPage { .. })
|
||||
));
|
||||
self.inner.dirty = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PageWriteGuard<'_> {
|
||||
///
|
||||
/// If the buffer was allocated for a page that was not already in the
|
||||
/// cache, but the lock_for_read/write() caller dropped the buffer without
|
||||
/// initializing it, remove the mapping from the page cache.
|
||||
///
|
||||
fn drop(&mut self) {
|
||||
assert!(self.inner.key.is_some());
|
||||
if !self.valid {
|
||||
let self_key = self.inner.key.as_ref().unwrap();
|
||||
PAGE_CACHE.get().unwrap().remove_mapping(self_key);
|
||||
self.inner.key = None;
|
||||
self.inner.dirty = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// lock_for_read() return value
|
||||
pub enum ReadBufResult<'a> {
|
||||
Found(PageReadGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
/// lock_for_write() return value
|
||||
pub enum WriteBufResult<'a> {
|
||||
Found(PageWriteGuard<'a>),
|
||||
NotFound(PageWriteGuard<'a>),
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// Section 1.1: Public interface functions for looking up and memorizing materialized page
|
||||
// versions in the page cache
|
||||
//
|
||||
|
||||
/// Look up a materialized page version.
|
||||
///
|
||||
/// The 'lsn' is an upper bound, this will return the latest version of
|
||||
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
|
||||
/// returned page.
|
||||
pub fn lookup_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
let mut cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
if let Some(guard) = self.try_lock_for_read(&mut cache_key) {
|
||||
if let CacheKey::MaterializedPage { hash_key: _, lsn } = cache_key {
|
||||
Some((lsn, guard))
|
||||
} else {
|
||||
panic!("unexpected key type in slot");
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
pub fn memorize_materialized_page(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
rel_tag: RelTag,
|
||||
blknum: u32,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
) {
|
||||
let cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
rel_tag,
|
||||
blknum,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
match self.lock_for_write(&cache_key) {
|
||||
WriteBufResult::Found(write_guard) => {
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(*write_guard == img);
|
||||
}
|
||||
WriteBufResult::NotFound(mut write_guard) => {
|
||||
write_guard.copy_from_slice(img);
|
||||
write_guard.mark_valid();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with Ephemeral pages.
|
||||
|
||||
pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> ReadBufResult {
|
||||
let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key)
|
||||
}
|
||||
|
||||
pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> WriteBufResult {
|
||||
let cache_key = CacheKey::EphemeralPage { file_id, blkno };
|
||||
|
||||
self.lock_for_write(&cache_key)
|
||||
}
|
||||
|
||||
/// Immediately drop all buffers belonging to given file, without writeback
|
||||
pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
|
||||
for slot_idx in 0..self.slots.len() {
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
if let Some(key) = &inner.key {
|
||||
match key {
|
||||
CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(key);
|
||||
inner.key = None;
|
||||
inner.dirty = false;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 2: Internal interface functions for lookup/update.
|
||||
//
|
||||
// To add support for a new kind of "thing" to cache, you will need
|
||||
// to add public interface routines above, and code to deal with the
|
||||
// "mappings" after this section. But the routines in this section should
|
||||
// not require changes.
|
||||
|
||||
/// Look up a page in the cache.
|
||||
///
|
||||
/// If the search criteria is not exact, *cache_key is updated with the key
|
||||
/// for exact key of the returned page. (For materialized pages, that means
|
||||
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
|
||||
/// version.)
|
||||
///
|
||||
/// If no page is found, returns None and *cache_key is left unmodified.
|
||||
///
|
||||
fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
|
||||
let cache_key_orig = cache_key.clone();
|
||||
if let Some(slot_idx) = self.search_mapping(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.read().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageReadGuard(inner));
|
||||
} else {
|
||||
// search_mapping might have modified the search key; restore it.
|
||||
*cache_key = cache_key_orig;
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a locked buffer for given block.
|
||||
///
|
||||
/// Like try_lock_for_read(), if the search criteria is not exact and the
|
||||
/// page is already found in the cache, *cache_key is updated.
|
||||
///
|
||||
/// If the page is not found in the cache, this allocates a new buffer for
|
||||
/// it. The caller may then initialize the buffer with the contents, and
|
||||
/// call mark_valid().
|
||||
///
|
||||
/// Example usage:
|
||||
///
|
||||
/// ```ignore
|
||||
/// let cache = page_cache::get();
|
||||
///
|
||||
/// match cache.lock_for_read(&key) {
|
||||
/// ReadBufResult::Found(read_guard) => {
|
||||
/// // The page was found in cache. Use it
|
||||
/// },
|
||||
/// ReadBufResult::NotFound(write_guard) => {
|
||||
/// // The page was not found in cache. Read it from disk into the
|
||||
/// // buffer.
|
||||
/// //read_my_page_from_disk(write_guard);
|
||||
///
|
||||
/// // The buffer contents are now valid. Tell the page cache.
|
||||
/// write_guard.mark_valid();
|
||||
/// },
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
fn lock_for_read(&self, cache_key: &mut CacheKey) -> ReadBufResult {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(read_guard) = self.try_lock_for_read(cache_key) {
|
||||
return ReadBufResult::Found(read_guard);
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return ReadBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
/// Look up a page in the cache and lock it in write mode. If it's not
|
||||
/// found, returns None.
|
||||
///
|
||||
/// When locking a page for writing, the search criteria is always "exact".
|
||||
fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
|
||||
if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().unwrap();
|
||||
if inner.key.as_ref() == Some(cache_key) {
|
||||
slot.inc_usage_count();
|
||||
return Some(PageWriteGuard { inner, valid: true });
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Return a write-locked buffer for given block.
|
||||
///
|
||||
/// Similar to lock_for_read(), but the returned buffer is write-locked and
|
||||
/// may be modified by the caller even if it's already found in the cache.
|
||||
fn lock_for_write(&self, cache_key: &CacheKey) -> WriteBufResult {
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(write_guard) = self.try_lock_for_write(cache_key) {
|
||||
return WriteBufResult::Found(write_guard);
|
||||
}
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self.find_victim();
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
inner.dirty = false;
|
||||
slot.usage_count.store(1, Ordering::Relaxed);
|
||||
|
||||
return WriteBufResult::NotFound(PageWriteGuard {
|
||||
inner,
|
||||
valid: false,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 3: Mapping functions
|
||||
//
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Returns the slot index, if any. If the search criteria is not exact,
|
||||
/// *cache_key is updated with the actual key of the found page.
|
||||
///
|
||||
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
|
||||
/// get recycled for an unrelated page immediately after this function
|
||||
/// returns. The caller is responsible for re-checking that the slot still
|
||||
/// contains the page with the same key before using it.
|
||||
///
|
||||
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => version_idx,
|
||||
Err(0) => return None,
|
||||
Err(version_idx) => version_idx - 1,
|
||||
};
|
||||
let version = &versions[version_idx];
|
||||
*lsn = version.lsn;
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Like 'search_mapping, but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Some(versions[version_idx].slot_idx)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let map = self.ephemeral_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Remove mapping for given key.
|
||||
///
|
||||
fn remove_mapping(&self, old_key: &CacheKey) {
|
||||
match old_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: old_hash_key,
|
||||
lsn: old_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
|
||||
let versions = old_entry.get_mut();
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
|
||||
versions.remove(version_idx);
|
||||
if versions.is_empty() {
|
||||
old_entry.remove_entry();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
panic!("could not find old key in mapping")
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
.expect("could not find old key in mapping");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Insert mapping for given key.
|
||||
///
|
||||
/// If a mapping already existed for the given key, returns the slot index
|
||||
/// of the existing mapping and leaves it untouched.
|
||||
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
|
||||
match new_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: new_key,
|
||||
lsn: new_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
let versions = map.entry(new_key.clone()).or_default();
|
||||
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => Some(versions[version_idx].slot_idx),
|
||||
Err(version_idx) => {
|
||||
versions.insert(
|
||||
version_idx,
|
||||
Version {
|
||||
lsn: *new_lsn,
|
||||
slot_idx,
|
||||
},
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
let mut map = self.ephemeral_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
Entry::Occupied(entry) => Some(*entry.get()),
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(slot_idx);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Section 4: Misc internal helpers
|
||||
//
|
||||
|
||||
/// Find a slot to evict.
|
||||
///
|
||||
/// On return, the slot is empty and write-locked.
|
||||
fn find_victim(&self) -> (usize, RwLockWriteGuard<SlotInner>) {
|
||||
let iter_limit = self.slots.len() * 2;
|
||||
let mut iters = 0;
|
||||
loop {
|
||||
let slot_idx = self.next_evict_slot.fetch_add(1, Ordering::Relaxed) % self.slots.len();
|
||||
|
||||
let slot = &self.slots[slot_idx];
|
||||
|
||||
if slot.dec_usage_count() == 0 || iters >= iter_limit {
|
||||
let mut inner = slot.inner.write().unwrap();
|
||||
|
||||
if let Some(old_key) = &inner.key {
|
||||
if inner.dirty {
|
||||
if let Err(err) = Self::writeback(old_key, inner.buf) {
|
||||
// Writing the page to disk failed.
|
||||
//
|
||||
// FIXME: What to do here, when? We could propagate the error to the
|
||||
// caller, but victim buffer is generally unrelated to the original
|
||||
// call. It can even belong to a different tenant. Currently, we
|
||||
// report the error to the log and continue the clock sweep to find
|
||||
// a different victim. But if the problem persists, the page cache
|
||||
// could fill up with dirty pages that we cannot evict, and we will
|
||||
// loop retrying the writebacks indefinitely.
|
||||
error!("writeback of buffer {:?} failed: {}", old_key, err);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// remove mapping for old buffer
|
||||
self.remove_mapping(old_key);
|
||||
inner.dirty = false;
|
||||
inner.key = None;
|
||||
}
|
||||
return (slot_idx, inner);
|
||||
}
|
||||
|
||||
iters += 1;
|
||||
}
|
||||
}
|
||||
|
||||
fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: _,
|
||||
} => {
|
||||
panic!("unexpected dirty materialized page");
|
||||
}
|
||||
CacheKey::EphemeralPage { file_id, blkno } => {
|
||||
writeback_ephemeral_file(*file_id, *blkno, buf)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize a new page cache
|
||||
///
|
||||
/// This should be called only once at page server startup.
|
||||
fn new(num_pages: usize) -> Self {
|
||||
assert!(num_pages > 0, "page cache size must be > 0");
|
||||
|
||||
let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());
|
||||
|
||||
let slots = page_buffer
|
||||
.chunks_exact_mut(PAGE_SZ)
|
||||
.map(|chunk| {
|
||||
let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();
|
||||
|
||||
Slot {
|
||||
inner: RwLock::new(SlotInner {
|
||||
key: None,
|
||||
buf,
|
||||
dirty: false,
|
||||
}),
|
||||
usage_count: AtomicU8::new(0),
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
ephemeral_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -10,7 +10,7 @@
|
||||
// *callmemaybe <zenith timelineid> $url* -- ask pageserver to start walreceiver on $url
|
||||
//
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{anyhow, bail, ensure, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
@@ -456,11 +456,6 @@ impl PageServerHandler {
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant(tenantid, timelineid)?;
|
||||
if let Some(lsn) = lsn {
|
||||
timeline
|
||||
.check_lsn_is_in_scope(lsn)
|
||||
.context("invalid basebackup lsn")?;
|
||||
}
|
||||
|
||||
// switch client to COPYOUT
|
||||
pgb.write_message(&BeMessage::CopyOutResponse)?;
|
||||
@@ -696,7 +691,9 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layer_relfiles_total"),
|
||||
RowDescriptor::int8_col(b"layer_relfiles_needed_by_cutoff"),
|
||||
|
||||
@@ -3,21 +3,21 @@
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! There are a few components the storage machinery consists of:
|
||||
//! * [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`RelishStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`rust_s3`] uses AWS S3 bucket entirely as an external storage
|
||||
//!
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//!
|
||||
//! * public API via to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_checkpoint_upload`]
|
||||
//! * public API via to interact with the external world: [`run_storage_sync_thread`] and [`schedule_timeline_upload`]
|
||||
//!
|
||||
//! Here's a schematic overview of all interactions backup and the rest of the pageserver perform:
|
||||
//! Here's a schematic overview of all interactions relish storage and the rest of the pageserver perform:
|
||||
//!
|
||||
//! +------------------------+ +--------->-------+
|
||||
//! | | - - - (init async loop) - - - -> | |
|
||||
//! | | | |
|
||||
//! | | -------------------------------> | async |
|
||||
//! | pageserver | (schedule checkpoint upload) | upload/download |
|
||||
//! | pageserver | (schedule frozen layer upload) | upload/download |
|
||||
//! | | | loop |
|
||||
//! | | <------------------------------- | |
|
||||
//! | | (register downloaded layers) | |
|
||||
@@ -29,7 +29,7 @@
|
||||
//! V
|
||||
//! +------------------------+
|
||||
//! | |
|
||||
//! | [`RemoteStorage`] impl |
|
||||
//! | [`RelishStorage`] impl |
|
||||
//! | |
|
||||
//! | pageserver assumes it |
|
||||
//! | owns exclusive write |
|
||||
@@ -56,7 +56,7 @@
|
||||
//! When the pageserver terminates, the upload loop finishes a current image sync task (if any) and exits.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
|
||||
//! * pageserver assumes it has exclusive write access to the relish storage. If supported, the way multiple pageservers can be separated in the same storage
|
||||
//! (i.e. using different directories in the local filesystem external storage), but totally up to the storage implementation and not covered with the trait API.
|
||||
//!
|
||||
//! * the uploads do not happen right after pageserver startup, they are registered when
|
||||
@@ -65,7 +65,7 @@
|
||||
//!
|
||||
//! * the uploads do not happen right after the upload registration: the sync loop might be occupied with other tasks, or tasks with bigger priority could be waiting already
|
||||
//!
|
||||
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happens on an image scale: a big set of remote files,
|
||||
//! * all synchronization tasks (including the public API to register uploads and downloads and the sync queue management) happens on an image scale: a big set of relish files,
|
||||
//! enough to represent (and recover, if needed) a certain timeline state. On the contrary, all internal storage CRUD calls are made per reilsh file from those images.
|
||||
//! This way, the synchronization is able to download the image partially, if some state was synced before, but exposes correctly synced images only.
|
||||
|
||||
@@ -78,18 +78,15 @@ use std::{
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::io;
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
pub use self::storage_sync::schedule_timeline_checkpoint_upload;
|
||||
pub use self::storage_sync::schedule_timeline_upload;
|
||||
use self::{local_fs::LocalFs, rust_s3::S3};
|
||||
use crate::{PageServerConf, RemoteStorageKind};
|
||||
|
||||
/// Any timeline has its own id and its own tenant it belongs to,
|
||||
/// the sync processes group timelines by both for simplicity.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||
pub struct TimelineSyncId(ZTenantId, ZTimelineId);
|
||||
use crate::{
|
||||
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
|
||||
PageServerConf, RelishStorageKind,
|
||||
};
|
||||
|
||||
/// Based on the config, initiates the remote storage connection and starts a separate thread
|
||||
/// that ensures that pageserver and the remote storage are in sync with each other.
|
||||
@@ -97,22 +94,19 @@ pub struct TimelineSyncId(ZTenantId, ZTimelineId);
|
||||
pub fn run_storage_sync_thread(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<Option<thread::JoinHandle<anyhow::Result<()>>>> {
|
||||
match &config.remote_storage_config {
|
||||
Some(storage_config) => {
|
||||
let max_concurrent_sync = storage_config.max_concurrent_sync;
|
||||
let max_sync_errors = storage_config.max_sync_errors;
|
||||
let handle = match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
match &config.relish_storage_config {
|
||||
Some(relish_storage_config) => {
|
||||
let max_concurrent_sync = relish_storage_config.max_concurrent_sync;
|
||||
let handle = match &relish_storage_config.storage {
|
||||
RelishStorageKind::LocalFs(root) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
),
|
||||
RemoteStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
RelishStorageKind::AwsS3(s3_config) => storage_sync::spawn_storage_sync_thread(
|
||||
config,
|
||||
S3::new(s3_config, &config.workdir)?,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
),
|
||||
};
|
||||
handle.map(Some)
|
||||
@@ -125,43 +119,46 @@ pub fn run_storage_sync_thread(
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations with storage files.
|
||||
#[async_trait::async_trait]
|
||||
trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type StoragePath;
|
||||
trait RelishStorage: Send + Sync {
|
||||
/// A way to uniquely reference relish in the remote storage.
|
||||
type RelishStoragePath;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath>;
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf>;
|
||||
/// Gets the layered storage information about the given entry.
|
||||
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()>;
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
async fn download(
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()>;
|
||||
from: &Self::RelishStoragePath,
|
||||
// rust_s3 `get_object_stream` method requires `std::io::BufWriter` for some reason, not the async counterpart
|
||||
// that forces us to consume and return the writer to satisfy the blocking operation async wrapper requirements
|
||||
to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
async fn download_range(
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()>;
|
||||
/// Information about a certain remote storage entry.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
struct RemoteRelishInfo {
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
/// Path in the pageserver workdir where the file should go to.
|
||||
download_destination: PathBuf,
|
||||
is_metadata: bool,
|
||||
}
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
@@ -180,3 +177,147 @@ fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_ids_from_path<'a, R: std::fmt::Display>(
|
||||
path_segments: impl Iterator<Item = &'a str>,
|
||||
path_log_representation: &R,
|
||||
) -> anyhow::Result<(ZTenantId, ZTimelineId)> {
|
||||
let mut segments = path_segments.skip_while(|&segment| segment != TENANTS_SEGMENT_NAME);
|
||||
let tenants_segment = segments.next().ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no '{}' segment in the storage path '{}'",
|
||||
TENANTS_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
tenants_segment == TENANTS_SEGMENT_NAME,
|
||||
"Failed to extract '{}' segment from storage path '{}'",
|
||||
TENANTS_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
);
|
||||
let tenant_id = segments
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no tenant id in the storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?
|
||||
.parse::<ZTenantId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to parse tenant id from storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
|
||||
let timelines_segment = segments.next().ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no '{}' segment in the storage path '{}'",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
timelines_segment == TIMELINES_SEGMENT_NAME,
|
||||
"Failed to extract '{}' segment from storage path '{}'",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
path_log_representation
|
||||
);
|
||||
let timeline_id = segments
|
||||
.next()
|
||||
.ok_or_else(|| {
|
||||
anyhow!(
|
||||
"Found no timeline id in the storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?
|
||||
.parse::<ZTimelineId>()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to parse timeline id from storage path '{}'",
|
||||
path_log_representation
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok((tenant_id, timeline_id))
|
||||
}
|
||||
|
||||
/// A set of common test utils to share in unit tests inside the module tree.
|
||||
#[cfg(test)]
|
||||
mod test_utils {
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::ensure;
|
||||
|
||||
use crate::{
|
||||
layered_repository::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
/// Gives a timeline path with pageserver workdir stripped off.
|
||||
pub fn relative_timeline_path(harness: &RepoHarness) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
Ok(timeline_path
|
||||
.strip_prefix(&harness.conf.workdir)?
|
||||
.to_path_buf())
|
||||
}
|
||||
|
||||
/// Creates a path with custom tenant id in one of its segments.
|
||||
/// Useful for emulating paths with wrong ids.
|
||||
pub fn custom_tenant_id_path(
|
||||
path_with_tenant_id: &Path,
|
||||
new_tenant_id: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let mut new_path = PathBuf::new();
|
||||
let mut is_tenant_id = false;
|
||||
let mut tenant_id_replaced = false;
|
||||
for segment in path_with_tenant_id {
|
||||
match segment.to_str() {
|
||||
Some(TENANTS_SEGMENT_NAME) => is_tenant_id = true,
|
||||
Some(_tenant_id_str) if is_tenant_id => {
|
||||
is_tenant_id = false;
|
||||
new_path.push(new_tenant_id);
|
||||
tenant_id_replaced = true;
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
new_path.push(segment)
|
||||
}
|
||||
|
||||
ensure!(tenant_id_replaced, "Found no tenant id segment to replace");
|
||||
Ok(new_path)
|
||||
}
|
||||
|
||||
/// Creates a path with custom timeline id in one of its segments.
|
||||
/// Useful for emulating paths with wrong ids.
|
||||
pub fn custom_timeline_id_path(
|
||||
path_with_timeline_id: &Path,
|
||||
new_timeline_id: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let mut new_path = PathBuf::new();
|
||||
let mut is_timeline_id = false;
|
||||
let mut timeline_id_replaced = false;
|
||||
for segment in path_with_timeline_id {
|
||||
match segment.to_str() {
|
||||
Some(TIMELINES_SEGMENT_NAME) => is_timeline_id = true,
|
||||
Some(_timeline_id_str) if is_timeline_id => {
|
||||
is_timeline_id = false;
|
||||
new_path.push(new_timeline_id);
|
||||
timeline_id_replaced = true;
|
||||
continue;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
new_path.push(segment)
|
||||
}
|
||||
|
||||
ensure!(
|
||||
timeline_id_replaced,
|
||||
"Found no timeline id segment to replace"
|
||||
);
|
||||
Ok(new_path)
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
# Non-implementation details
|
||||
|
||||
This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are the future development plans.
|
||||
Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
|
||||
Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../relish_storage.rs) and its submodules.
|
||||
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
|
||||
|
||||
## Approach
|
||||
@@ -28,7 +28,7 @@ As mentioned, the backup component is rather new and under development currently
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* Remote storage model is the same as the `tenants/` directory contents of the pageserver's local workdir storage.
|
||||
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish files and a metadata file, ~31 MB combined.
|
||||
This is relatively simple to implement, but may be costly to use in AWS S3: an initial data image contains ~782 relish file and a metadata file, ~31 MB combined.
|
||||
AWS charges per API call and for traffic either, layers are expected to be updated frequently, so this model most probably is ineffective.
|
||||
Additionally, pageservers might need to migrate images between tenants, which does not improve the situation.
|
||||
|
||||
@@ -43,6 +43,13 @@ AWS S3 returns file checksums during the `list` operation, so that can be used t
|
||||
|
||||
For now, due to this, we consider local workdir files as source of truth, not removing them ever and adjusting remote files instead, if image files mismatch.
|
||||
|
||||
* no proper retry management
|
||||
|
||||
Now, the storage sync attempts to redo the upload/download operation for the image files that failed.
|
||||
No proper task eviction or backpressure is implemented currently: the tasks will stay in the queue forever, reattempting the downloads.
|
||||
|
||||
This will be fixed when more details on the file consistency model will be agreed on.
|
||||
|
||||
* sad rust-s3 api
|
||||
|
||||
rust-s3 is not very pleasant to use:
|
||||
@@ -1,23 +1,23 @@
|
||||
//! Local filesystem acting as a remote storage.
|
||||
//! Local filesystem relish storage.
|
||||
//! Multiple pageservers can use the same "storage" of this kind by using different storage roots.
|
||||
//!
|
||||
//! This storage used in pageserver tests, but can also be used in cases when a certain persistent
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
ffi::OsStr,
|
||||
future::Future,
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
|
||||
};
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::{fs, io};
|
||||
use tracing::*;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage};
|
||||
use super::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo};
|
||||
use crate::layered_repository::METADATA_FILE_NAME;
|
||||
|
||||
pub struct LocalFs {
|
||||
pageserver_workdir: &'static Path,
|
||||
@@ -25,7 +25,7 @@ pub struct LocalFs {
|
||||
}
|
||||
|
||||
impl LocalFs {
|
||||
/// Attempts to create local FS storage, along with its root directory.
|
||||
/// Attempts to create local FS relish storage, along with the storage root directory.
|
||||
pub fn new(root: PathBuf, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
if !root.exists() {
|
||||
std::fs::create_dir_all(&root).with_context(|| {
|
||||
@@ -56,30 +56,90 @@ impl LocalFs {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for LocalFs {
|
||||
type StoragePath = PathBuf;
|
||||
impl RelishStorage for LocalFs {
|
||||
type RelishStoragePath = PathBuf;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
Ok(self.root.join(
|
||||
strip_path_prefix(self.pageserver_workdir, local_path)
|
||||
.context("local path does not belong to this storage")?,
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
|
||||
let is_metadata =
|
||||
storage_path.file_name().and_then(OsStr::to_str) == Some(METADATA_FILE_NAME);
|
||||
let relative_path = strip_path_prefix(&self.root, storage_path)
|
||||
.context("local path does not belong to this storage")?;
|
||||
Ok(self.pageserver_workdir.join(relative_path))
|
||||
let download_destination = self.pageserver_workdir.join(relative_path);
|
||||
let (tenant_id, timeline_id) = parse_ids_from_path(
|
||||
relative_path.iter().filter_map(|segment| segment.to_str()),
|
||||
&relative_path.display(),
|
||||
)?;
|
||||
Ok(RemoteRelishInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
download_destination,
|
||||
is_metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
Ok(get_all_files(&self.root).await?.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
from: &Self::RelishStoragePath,
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let updated_buffer = tokio::task::spawn_blocking(move || {
|
||||
let mut source = std::io::BufReader::new(
|
||||
std::fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
std::io::copy(&mut source, &mut to)
|
||||
.context("Failed to download the relish file")?;
|
||||
to.flush().context("Failed to flush the download buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.context("Failed to spawn a blocking task")??;
|
||||
Ok(updated_buffer)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<R: io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let target_file_path = self.resolve_in_storage(to)?;
|
||||
create_target_directory(&target_file_path).await?;
|
||||
@@ -97,128 +157,11 @@ impl RemoteStorage for LocalFs {
|
||||
})?,
|
||||
);
|
||||
|
||||
io::copy(&mut from, &mut destination)
|
||||
io::copy_buf(from, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file to the local storage at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
destination.flush().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload file to the local storage at '{}'",
|
||||
target_file_path.display()
|
||||
)
|
||||
})?;
|
||||
.context("Failed to upload relish to local storage")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
io::copy(&mut source, to).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
source.flush().await?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
ensure!(
|
||||
end_exclusive > start_inclusive,
|
||||
"Invalid range, start ({}) is bigger then end ({:?})",
|
||||
start_inclusive,
|
||||
end_exclusive
|
||||
);
|
||||
if start_inclusive == end_exclusive.saturating_sub(1) {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")?;
|
||||
match end_exclusive {
|
||||
Some(end_exclusive) => {
|
||||
io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
|
||||
}
|
||||
None => io::copy(&mut source, to).await,
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' range from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let file_path = self.resolve_in_storage(path)?;
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
Ok(fs::remove_file(file_path).await?)
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
@@ -258,7 +201,7 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
let target_dir = match target_file_path.parent() {
|
||||
Some(parent_dir) => parent_dir,
|
||||
None => bail!(
|
||||
"File path '{}' has no parent directory",
|
||||
"Relish path '{}' has no parent directory",
|
||||
target_file_path.display()
|
||||
),
|
||||
};
|
||||
@@ -271,7 +214,9 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
relish_storage::test_utils::{
|
||||
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
@@ -286,13 +231,13 @@ mod pure_tests {
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("file_name");
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join("relish_name");
|
||||
let expected_path = storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?);
|
||||
|
||||
assert_eq!(
|
||||
expected_path,
|
||||
storage.storage_path(&local_path).expect("Matching path should map to storage path normally"),
|
||||
"File paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
"Relish paths from pageserver workdir should be stored in local fs storage with the same path they have relative to the workdir"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -339,8 +284,8 @@ mod pure_tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
fn info_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("info_positive")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
@@ -350,13 +295,16 @@ mod pure_tests {
|
||||
let name = "not a metadata";
|
||||
let local_path = repo_harness.timeline_path(&TIMELINE_ID).join(name);
|
||||
assert_eq!(
|
||||
local_path,
|
||||
RemoteRelishInfo {
|
||||
tenant_id: repo_harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
download_destination: local_path.clone(),
|
||||
is_metadata: false,
|
||||
},
|
||||
storage
|
||||
.local_path(
|
||||
&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?)
|
||||
)
|
||||
.info(&storage_root.join(local_path.strip_prefix(&repo_harness.conf.workdir)?))
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
"Should be able to parse metadata out of the correctly named remote delta relish"
|
||||
);
|
||||
|
||||
let local_metadata_path = repo_harness
|
||||
@@ -364,10 +312,15 @@ mod pure_tests {
|
||||
.join(METADATA_FILE_NAME);
|
||||
let remote_metadata_path = storage.storage_path(&local_metadata_path)?;
|
||||
assert_eq!(
|
||||
local_metadata_path,
|
||||
RemoteRelishInfo {
|
||||
tenant_id: repo_harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
download_destination: local_metadata_path,
|
||||
is_metadata: true,
|
||||
},
|
||||
storage
|
||||
.local_path(&remote_metadata_path)
|
||||
.expect("For a valid input, valid local path should be parsed"),
|
||||
.info(&remote_metadata_path)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
|
||||
@@ -375,30 +328,54 @@ mod pure_tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_negatives() -> anyhow::Result<()> {
|
||||
fn info_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.local_path` parameter requirements
|
||||
fn local_path_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
|
||||
match storage.local_path(storage_path) {
|
||||
Ok(wrong_path) => panic!(
|
||||
"Expected local path input {:?} to cause an error, but got file path: {:?}",
|
||||
storage_path, wrong_path,
|
||||
#[allow(clippy::ptr_arg)] // have to use &PathBuf due to `storage.info` parameter requirements
|
||||
fn storage_info_error(storage: &LocalFs, storage_path: &PathBuf) -> String {
|
||||
match storage.info(storage_path) {
|
||||
Ok(wrong_info) => panic!(
|
||||
"Expected storage path input {:?} to cause an error, but got relish info: {:?}",
|
||||
storage_path, wrong_info,
|
||||
),
|
||||
Err(e) => format!("{:?}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("local_path_negatives")?;
|
||||
let repo_harness = RepoHarness::create("info_negatives")?;
|
||||
let storage_root = PathBuf::from("somewhere").join("else");
|
||||
let storage = LocalFs {
|
||||
pageserver_workdir: &repo_harness.conf.workdir,
|
||||
root: storage_root,
|
||||
root: storage_root.clone(),
|
||||
};
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
let error_message = local_path_error(&storage, &PathBuf::from(totally_wrong_path));
|
||||
let error_message = storage_info_error(&storage, &PathBuf::from(totally_wrong_path));
|
||||
assert!(error_message.contains(totally_wrong_path));
|
||||
|
||||
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
|
||||
|
||||
let relative_relish_path =
|
||||
custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?
|
||||
.join("wrong_tenant_id_name");
|
||||
let wrong_tenant_id_path = storage_root.join(&relative_relish_path);
|
||||
let error_message = storage_info_error(&storage, &wrong_tenant_id_path);
|
||||
assert!(
|
||||
error_message.contains(relative_relish_path.to_str().unwrap()),
|
||||
"Error message '{}' does not contain the expected substring",
|
||||
error_message
|
||||
);
|
||||
|
||||
let relative_relish_path =
|
||||
custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?
|
||||
.join("wrong_timeline_id_name");
|
||||
let wrong_timeline_id_path = storage_root.join(&relative_relish_path);
|
||||
let error_message = storage_info_error(&storage, &wrong_timeline_id_path);
|
||||
assert!(
|
||||
error_message.contains(relative_relish_path.to_str().unwrap()),
|
||||
"Error message '{}' does not contain the expected substring",
|
||||
error_message
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -414,7 +391,7 @@ mod pure_tests {
|
||||
};
|
||||
|
||||
let storage_path = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&storage_path)?;
|
||||
let download_destination = dummy_storage.info(&storage_path)?.download_destination;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
@@ -427,24 +404,26 @@ mod pure_tests {
|
||||
|
||||
#[cfg(test)]
|
||||
mod fs_tests {
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
use crate::{
|
||||
relish_storage::test_utils::relative_timeline_path, repository::repo_harness::RepoHarness,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
|
||||
use std::io::Write;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("upload_file")?;
|
||||
async fn upload_relish() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("upload_relish")?;
|
||||
let storage = create_storage()?;
|
||||
|
||||
let source = create_file_for_upload(
|
||||
let mut source = create_file_for_upload(
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = PathBuf::from("/").join("somewhere").join("else");
|
||||
match storage.upload(source, &target_path).await {
|
||||
match storage.upload_relish(&mut source, &target_path).await {
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
@@ -452,22 +431,36 @@ mod fs_tests {
|
||||
assert!(message.contains("does not belong to the current storage"));
|
||||
}
|
||||
}
|
||||
assert!(storage.list().await?.is_empty());
|
||||
assert!(storage.list_relishes().await?.is_empty());
|
||||
|
||||
let target_path_1 = upload_dummy_file(&repo_harness, &storage, "upload_1").await?;
|
||||
assert_eq!(
|
||||
storage.list().await?,
|
||||
storage.list_relishes().await?,
|
||||
vec![target_path_1.clone()],
|
||||
"Should list a single file after first upload"
|
||||
);
|
||||
|
||||
let target_path_2 = upload_dummy_file(&repo_harness, &storage, "upload_2").await?;
|
||||
assert_eq!(
|
||||
list_files_sorted(&storage).await?,
|
||||
list_relishes_sorted(&storage).await?,
|
||||
vec![target_path_1.clone(), target_path_2.clone()],
|
||||
"Should list a two different files after second upload"
|
||||
);
|
||||
|
||||
// match storage.upload_relish(&mut source, &target_path_1).await {
|
||||
// Ok(()) => panic!("Should not allow reuploading storage files"),
|
||||
// Err(e) => {
|
||||
// let message = format!("{:?}", e);
|
||||
// assert!(message.contains(&target_path_1.display().to_string()));
|
||||
// assert!(message.contains("File exists"));
|
||||
// }
|
||||
// }
|
||||
assert_eq!(
|
||||
list_relishes_sorted(&storage).await?,
|
||||
vec![target_path_1, target_path_2],
|
||||
"Should list a two different files after all upload attempts"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -478,17 +471,17 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file")?;
|
||||
async fn download_relish() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_relish")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage.download(&upload_target, &mut content_bytes).await?;
|
||||
content_bytes.flush().await?;
|
||||
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
let contents_bytes = storage
|
||||
.download_relish(&upload_target, std::io::BufWriter::new(Vec::new()))
|
||||
.await?
|
||||
.into_inner()?;
|
||||
let contents = String::from_utf8(contents_bytes)?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
contents,
|
||||
@@ -496,7 +489,10 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage.download(&non_existing_path, &mut io::sink()).await {
|
||||
match storage
|
||||
.download_relish(&non_existing_path, std::io::BufWriter::new(Vec::new()))
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
@@ -508,128 +504,16 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_positive")?;
|
||||
async fn delete_relish() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("delete_relish")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.await?;
|
||||
full_range_bytes.flush().await?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
String::from_utf8(full_range_bytes.into_inner().into_inner())?,
|
||||
"Download full range should return the whole upload"
|
||||
);
|
||||
storage.delete_relish(&upload_target).await?;
|
||||
assert!(storage.list_relishes().await?.is_empty());
|
||||
|
||||
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let same_byte = 1_000_000_000;
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
same_byte,
|
||||
Some(same_byte + 1), // exclusive end
|
||||
&mut zero_range_bytes,
|
||||
)
|
||||
.await?;
|
||||
zero_range_bytes.flush().await?;
|
||||
assert!(
|
||||
zero_range_bytes.into_inner().into_inner().is_empty(),
|
||||
"Zero byte range should not download any part of the file"
|
||||
);
|
||||
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
first_part_local,
|
||||
first_part_remote.as_slice(),
|
||||
"First part bytes should be returned when requrested"
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
storage
|
||||
.download_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
second_part_local,
|
||||
second_part_remote.as_slice(),
|
||||
"Second part bytes should be returned when requrested"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_file_range_negative() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_file_range_negative")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
let start = 10000;
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("Invalid range"));
|
||||
assert!(error_string.contains(&start.to_string()));
|
||||
assert!(error_string.contains(&end.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn delete_file() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("delete_file")?;
|
||||
let storage = create_storage()?;
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&repo_harness, &storage, upload_name).await?;
|
||||
|
||||
storage.delete(&upload_target).await?;
|
||||
assert!(storage.list().await?.is_empty());
|
||||
|
||||
match storage.delete(&upload_target).await {
|
||||
match storage.delete_relish(&upload_target).await {
|
||||
Ok(()) => panic!("Should not allow deleting non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
@@ -645,12 +529,13 @@ mod fs_tests {
|
||||
storage: &LocalFs,
|
||||
name: &str,
|
||||
) -> anyhow::Result<PathBuf> {
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
|
||||
let storage_path = storage.root.join(relative_timeline_path).join(name);
|
||||
let storage_path = storage
|
||||
.root
|
||||
.join(relative_timeline_path(harness)?)
|
||||
.join(name);
|
||||
storage
|
||||
.upload(
|
||||
create_file_for_upload(
|
||||
.upload_relish(
|
||||
&mut create_file_for_upload(
|
||||
&storage.pageserver_workdir.join(name),
|
||||
&dummy_contents(name),
|
||||
)
|
||||
@@ -681,9 +566,9 @@ mod fs_tests {
|
||||
format!("contents for {}", name)
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut files = storage.list().await?;
|
||||
files.sort();
|
||||
Ok(files)
|
||||
async fn list_relishes_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
||||
let mut relishes = storage.list_relishes().await?;
|
||||
relishes.sort();
|
||||
Ok(relishes)
|
||||
}
|
||||
}
|
||||
@@ -1,15 +1,18 @@
|
||||
//! AWS S3 storage wrapper around `rust_s3` library.
|
||||
//! Currently does not allow multiple pageservers to use the same bucket concurrently: objects are
|
||||
//! AWS S3 relish storage wrapper around `rust_s3` library.
|
||||
//! Currently does not allow multiple pageservers to use the same bucket concurrently: relishes are
|
||||
//! placed in the root of the bucket.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::{
|
||||
io::Write,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use s3::{bucket::Bucket, creds::Credentials, region::Region};
|
||||
use tokio::io::{self, AsyncWriteExt};
|
||||
|
||||
use crate::{
|
||||
remote_storage::{strip_path_prefix, RemoteStorage},
|
||||
layered_repository::METADATA_FILE_NAME,
|
||||
relish_storage::{parse_ids_from_path, strip_path_prefix, RelishStorage, RemoteRelishInfo},
|
||||
S3Config,
|
||||
};
|
||||
|
||||
@@ -28,14 +31,14 @@ impl S3ObjectKey {
|
||||
}
|
||||
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
/// AWS S3 relish storage.
|
||||
pub struct S3 {
|
||||
pageserver_workdir: &'static Path,
|
||||
bucket: Bucket,
|
||||
}
|
||||
|
||||
impl S3 {
|
||||
/// Creates the storage, errors if incorrect AWS S3 configuration provided.
|
||||
/// Creates the relish storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
let region = aws_config
|
||||
.bucket_region
|
||||
@@ -62,10 +65,10 @@ impl S3 {
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
impl RemoteStorage for S3 {
|
||||
type StoragePath = S3ObjectKey;
|
||||
impl RelishStorage for S3 {
|
||||
type RelishStoragePath = S3ObjectKey;
|
||||
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::StoragePath> {
|
||||
fn storage_path(&self, local_path: &Path) -> anyhow::Result<Self::RelishStoragePath> {
|
||||
let relative_path = strip_path_prefix(self.pageserver_workdir, local_path)?;
|
||||
let mut key = String::new();
|
||||
for segment in relative_path {
|
||||
@@ -75,11 +78,22 @@ impl RemoteStorage for S3 {
|
||||
Ok(S3ObjectKey(key))
|
||||
}
|
||||
|
||||
fn local_path(&self, storage_path: &Self::StoragePath) -> anyhow::Result<PathBuf> {
|
||||
Ok(storage_path.download_destination(self.pageserver_workdir))
|
||||
fn info(&self, storage_path: &Self::RelishStoragePath) -> anyhow::Result<RemoteRelishInfo> {
|
||||
let storage_path_key = &storage_path.0;
|
||||
let is_metadata =
|
||||
storage_path_key.ends_with(&format!("{}{}", S3_FILE_SEPARATOR, METADATA_FILE_NAME));
|
||||
let download_destination = storage_path.download_destination(self.pageserver_workdir);
|
||||
let (tenant_id, timeline_id) =
|
||||
parse_ids_from_path(storage_path_key.split(S3_FILE_SEPARATOR), storage_path_key)?;
|
||||
Ok(RemoteRelishInfo {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
download_destination,
|
||||
is_metadata,
|
||||
})
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::StoragePath>> {
|
||||
async fn list_relishes(&self) -> anyhow::Result<Vec<Self::RelishStoragePath>> {
|
||||
let list_response = self
|
||||
.bucket
|
||||
.list(String::new(), None)
|
||||
@@ -93,93 +107,32 @@ impl RemoteStorage for S3 {
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
async fn download_relish<W: 'static + std::io::Write + Send>(
|
||||
&self,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut upload_contents = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(&mut from, &mut upload_contents)
|
||||
.await
|
||||
.context("Failed to read the upload contents")?;
|
||||
upload_contents
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to read the upload contents")?;
|
||||
let upload_contents = upload_contents.into_inner().into_inner();
|
||||
|
||||
let (_, code) = self
|
||||
from: &Self::RelishStoragePath,
|
||||
mut to: std::io::BufWriter<W>,
|
||||
) -> anyhow::Result<std::io::BufWriter<W>> {
|
||||
let code = self
|
||||
.bucket
|
||||
.put_object(to.key(), &upload_contents)
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
let (data, code) = self
|
||||
.bucket
|
||||
.get_object(from.key())
|
||||
.get_object_stream(from.key(), &mut to)
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during downloading object, code: {}",
|
||||
"Received non-200 exit code during downloading object from directory, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
// we don't have to write vector into the destination this way, `to_write_all` would be enough.
|
||||
// but we want to prepare for migration on `rusoto`, that has a streaming HTTP body instead here, with
|
||||
// which it makes more sense to use `io::copy`.
|
||||
io::copy(&mut data.as_slice(), to)
|
||||
.await
|
||||
.context("Failed to write downloaded data into the destination buffer")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_range(
|
||||
&self,
|
||||
from: &Self::StoragePath,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<()> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be exclusive
|
||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
let (data, code) = self
|
||||
.bucket
|
||||
.get_object_range(from.key(), start_inclusive, end_inclusive)
|
||||
tokio::task::spawn_blocking(move || {
|
||||
to.flush().context("Failed to flush the download buffer")?;
|
||||
Ok::<_, anyhow::Error>(to)
|
||||
})
|
||||
.await
|
||||
.with_context(|| format!("Failed to download s3 object with key {}", from.key()))?;
|
||||
if code != 206 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-206 exit code during downloading object range, code: {}",
|
||||
code
|
||||
))
|
||||
} else {
|
||||
// see `download` function above for the comment on why `Vec<u8>` buffer is copied this way
|
||||
io::copy(&mut data.as_slice(), to)
|
||||
.await
|
||||
.context("Failed to write downloaded range into the destination buffer")?;
|
||||
Ok(())
|
||||
.context("Failed to join the download buffer flush task")?
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
async fn delete_relish(&self, path: &Self::RelishStoragePath) -> anyhow::Result<()> {
|
||||
let (_, code) = self
|
||||
.bucket
|
||||
.delete_object(path.key())
|
||||
@@ -195,12 +148,35 @@ impl RemoteStorage for S3 {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
async fn upload_relish<R: tokio::io::AsyncRead + std::marker::Unpin + Send>(
|
||||
&self,
|
||||
from: &mut tokio::io::BufReader<R>,
|
||||
to: &Self::RelishStoragePath,
|
||||
) -> anyhow::Result<()> {
|
||||
let code = self
|
||||
.bucket
|
||||
.put_object_stream(from, to.key())
|
||||
.await
|
||||
.with_context(|| format!("Failed to create s3 object with key {}", to.key()))?;
|
||||
if code != 200 {
|
||||
Err(anyhow::format_err!(
|
||||
"Received non-200 exit code during creating object with key '{}', code: {}",
|
||||
to.key(),
|
||||
code
|
||||
))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{
|
||||
layered_repository::metadata::METADATA_FILE_NAME,
|
||||
relish_storage::test_utils::{
|
||||
custom_tenant_id_path, custom_timeline_id_path, relative_timeline_path,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
@@ -237,7 +213,7 @@ mod tests {
|
||||
let repo_harness = RepoHarness::create("storage_path_positive")?;
|
||||
|
||||
let segment_1 = "matching";
|
||||
let segment_2 = "file";
|
||||
let segment_2 = "relish";
|
||||
let local_path = &repo_harness.conf.workdir.join(segment_1).join(segment_2);
|
||||
let expected_key = S3ObjectKey(format!(
|
||||
"{SEPARATOR}{}{SEPARATOR}{}",
|
||||
@@ -302,26 +278,35 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn local_path_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("local_path_positive")?;
|
||||
fn info_positive() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("info_positive")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_dir.strip_prefix(&repo_harness.conf.workdir)?;
|
||||
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
|
||||
|
||||
let s3_key = create_s3_key(&relative_timeline_path.join("not a metadata"));
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
RemoteRelishInfo {
|
||||
tenant_id: repo_harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
is_metadata: false,
|
||||
},
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.info(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote delta file"
|
||||
"Should be able to parse metadata out of the correctly named remote delta relish"
|
||||
);
|
||||
|
||||
let s3_key = create_s3_key(&relative_timeline_path.join(METADATA_FILE_NAME));
|
||||
assert_eq!(
|
||||
s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
RemoteRelishInfo {
|
||||
tenant_id: repo_harness.tenant_id,
|
||||
timeline_id: TIMELINE_ID,
|
||||
download_destination: s3_key.download_destination(&repo_harness.conf.workdir),
|
||||
is_metadata: true,
|
||||
},
|
||||
storage
|
||||
.local_path(&s3_key)
|
||||
.info(&s3_key)
|
||||
.expect("For a valid input, valid S3 info should be parsed"),
|
||||
"Should be able to parse metadata out of the correctly named remote metadata file"
|
||||
);
|
||||
@@ -329,6 +314,43 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn info_negatives() -> anyhow::Result<()> {
|
||||
#[track_caller]
|
||||
fn storage_info_error(storage: &S3, s3_key: &S3ObjectKey) -> String {
|
||||
match storage.info(s3_key) {
|
||||
Ok(wrong_info) => panic!(
|
||||
"Expected key {:?} to error, but got relish info: {:?}",
|
||||
s3_key, wrong_info,
|
||||
),
|
||||
Err(e) => e.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
let repo_harness = RepoHarness::create("info_negatives")?;
|
||||
let storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
let relative_timeline_path = relative_timeline_path(&repo_harness)?;
|
||||
|
||||
let totally_wrong_path = "wrong_wrong_wrong";
|
||||
let error_message =
|
||||
storage_info_error(&storage, &S3ObjectKey(totally_wrong_path.to_string()));
|
||||
assert!(error_message.contains(totally_wrong_path));
|
||||
|
||||
let wrong_tenant_id = create_s3_key(
|
||||
&custom_tenant_id_path(&relative_timeline_path, "wrong_tenant_id")?.join("name"),
|
||||
);
|
||||
let error_message = storage_info_error(&storage, &wrong_tenant_id);
|
||||
assert!(error_message.contains(&wrong_tenant_id.0));
|
||||
|
||||
let wrong_timeline_id = create_s3_key(
|
||||
&custom_timeline_id_path(&relative_timeline_path, "wrong_timeline_id")?.join("name"),
|
||||
);
|
||||
let error_message = storage_info_error(&storage, &wrong_timeline_id);
|
||||
assert!(error_message.contains(&wrong_timeline_id.0));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination_matches_original_path() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("download_destination_matches_original_path")?;
|
||||
@@ -337,7 +359,7 @@ mod tests {
|
||||
let dummy_storage = dummy_storage(&repo_harness.conf.workdir);
|
||||
|
||||
let key = dummy_storage.storage_path(&original_path)?;
|
||||
let download_destination = dummy_storage.local_path(&key)?;
|
||||
let download_destination = dummy_storage.info(&key)?.download_destination;
|
||||
|
||||
assert_eq!(
|
||||
original_path, download_destination,
|
||||
@@ -359,9 +381,9 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn create_s3_key(relative_file_path: &Path) -> S3ObjectKey {
|
||||
fn create_s3_key(relative_relish_path: &Path) -> S3ObjectKey {
|
||||
S3ObjectKey(
|
||||
relative_file_path
|
||||
relative_relish_path
|
||||
.iter()
|
||||
.fold(String::new(), |mut path_string, segment| {
|
||||
path_string.push(S3_FILE_SEPARATOR);
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,7 +1,7 @@
|
||||
use crate::relish::*;
|
||||
use crate::CheckpointConfig;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::ops::{AddAssign, Deref};
|
||||
@@ -16,19 +16,11 @@ use zenith_utils::zid::ZTimelineId;
|
||||
pub trait Repository: Send + Sync {
|
||||
fn shutdown(&self) -> Result<()>;
|
||||
|
||||
/// Stops all timeline-related process in the repository and removes the timeline data from memory.
|
||||
fn unload_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
fn get_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Create a new, empty timeline. The caller is responsible for loading data into it
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timelineid: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<dyn Timeline>>;
|
||||
fn create_empty_timeline(&self, timelineid: ZTimelineId) -> Result<Arc<dyn Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
@@ -132,9 +124,6 @@ pub trait Timeline: Send + Sync {
|
||||
/// Get a list of all existing non-relational objects
|
||||
fn list_nonrels(&self, lsn: Lsn) -> Result<HashSet<RelishTag>>;
|
||||
|
||||
/// Get the ancestor's timeline id
|
||||
fn get_ancestor_timeline_id(&self) -> Option<ZTimelineId>;
|
||||
|
||||
/// Get the LSN where this branch was created
|
||||
fn get_ancestor_lsn(&self) -> Lsn;
|
||||
|
||||
@@ -162,10 +151,6 @@ pub trait Timeline: Send + Sync {
|
||||
/// know anything about them here in the repository.
|
||||
fn checkpoint(&self, cconf: CheckpointConfig) -> Result<()>;
|
||||
|
||||
///
|
||||
/// Check that it is valid to request operations with that lsn.
|
||||
fn check_lsn_is_in_scope(&self, lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Retrieve current logical size of the timeline
|
||||
///
|
||||
/// NOTE: counted incrementally, includes ancestors,
|
||||
@@ -175,9 +160,6 @@ pub trait Timeline: Send + Sync {
|
||||
/// Does the same as get_current_logical_size but counted on demand.
|
||||
/// Used in tests to ensure thet incremental and non incremental variants match.
|
||||
fn get_current_logical_size_non_incremental(&self, lsn: Lsn) -> Result<usize>;
|
||||
|
||||
/// An escape hatch to allow "casting" a generic Timeline to LayeredTimeline.
|
||||
fn upgrade_to_layered_timeline(&self) -> &crate::layered_repository::LayeredTimeline;
|
||||
}
|
||||
|
||||
/// Various functions to mutate the timeline.
|
||||
@@ -217,9 +199,28 @@ pub struct WALRecord {
|
||||
pub main_data_offset: u32,
|
||||
}
|
||||
|
||||
impl WALRecord {
|
||||
pub fn pack(&self, buf: &mut BytesMut) {
|
||||
buf.put_u8(self.will_init as u8);
|
||||
buf.put_u32(self.main_data_offset);
|
||||
buf.put_u32(self.rec.len() as u32);
|
||||
buf.put_slice(&self.rec[..]);
|
||||
}
|
||||
pub fn unpack(buf: &mut Bytes) -> WALRecord {
|
||||
let will_init = buf.get_u8() != 0;
|
||||
let main_data_offset = buf.get_u32();
|
||||
let rec_len = buf.get_u32() as usize;
|
||||
let rec = buf.split_to(rec_len);
|
||||
WALRecord {
|
||||
will_init,
|
||||
rec,
|
||||
main_data_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod repo_harness {
|
||||
use bytes::BytesMut;
|
||||
use std::{fs, path::PathBuf};
|
||||
|
||||
use crate::{
|
||||
@@ -322,21 +323,18 @@ pub mod repo_harness {
|
||||
#[allow(clippy::bool_assert_comparison)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::layered_repository::metadata::METADATA_FILE_NAME;
|
||||
|
||||
use super::repo_harness::*;
|
||||
use super::*;
|
||||
use crate::layered_repository::METADATA_FILE_NAME;
|
||||
use postgres_ffi::{pg_constants, xlog_utils::SIZEOF_CHECKPOINT};
|
||||
use std::fs;
|
||||
|
||||
/// Arbitrary relation tag, for testing.
|
||||
const TESTREL_A_REL_TAG: RelTag = RelTag {
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
relnode: 1000,
|
||||
forknum: 0,
|
||||
};
|
||||
const TESTREL_A: RelishTag = RelishTag::Relation(TESTREL_A_REL_TAG);
|
||||
});
|
||||
const TESTREL_B: RelishTag = RelishTag::Relation(RelTag {
|
||||
spcnode: 0,
|
||||
dbnode: 111,
|
||||
@@ -362,7 +360,7 @@ mod tests {
|
||||
//repo.get_timeline("11223344556677881122334455667788");
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
@@ -480,7 +478,7 @@ mod tests {
|
||||
let repo = RepoHarness::create("test_drop_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
writer.put_page_image(TESTREL_A, 0, Lsn(0x20), TEST_IMG("foo blk 0 at 2"))?;
|
||||
@@ -517,7 +515,7 @@ mod tests {
|
||||
let repo = RepoHarness::create("test_truncate_extend")?.load();
|
||||
|
||||
// Create timeline to work on
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
//from storage_layer.rs
|
||||
@@ -616,7 +614,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_large_rel() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_large_rel")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
let mut lsn = 0x10;
|
||||
@@ -679,7 +677,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_list_rels_drop() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_list_rels_drop")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
const TESTDB: u32 = 111;
|
||||
|
||||
@@ -737,7 +735,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_branch() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_branch")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
let writer = tline.writer();
|
||||
|
||||
// Import initial dummy checkpoint record, otherwise the get_timeline() call
|
||||
@@ -786,175 +784,13 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn make_some_layers(tline: &Arc<dyn Timeline>, start_lsn: Lsn) -> Result<()> {
|
||||
let mut lsn = start_lsn;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
// Create a relation on the timeline
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
{
|
||||
let writer = tline.writer();
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
lsn += 0x10;
|
||||
writer.put_page_image(
|
||||
TESTREL_A,
|
||||
0,
|
||||
lsn,
|
||||
TEST_IMG(&format!("foo blk 0 at {}", lsn)),
|
||||
)?;
|
||||
writer.advance_last_record_lsn(lsn);
|
||||
}
|
||||
tline.checkpoint(CheckpointConfig::Forced)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_garbage_collected_data() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")?.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("we might've already garbage collected needed data"))
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> Result<()> {
|
||||
let repo = RepoHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")?.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
|
||||
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
assert!(&err
|
||||
.source()
|
||||
.unwrap()
|
||||
.to_string()
|
||||
.contains("is earlier than initdb lsn"));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_prohibit_get_page_at_lsn_for_garbage_collected_pages() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_prohibit_get_page_at_lsn_for_garbage_collected_pages")?
|
||||
.load();
|
||||
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
match tline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)) {
|
||||
Ok(_) => panic!("request for page should have failed"),
|
||||
Err(err) => assert!(err
|
||||
.to_string()
|
||||
.contains("tried to request a page version that was garbage collected")),
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_retain_data_in_parent_which_is_needed_for_child() -> Result<()> {
|
||||
let repo =
|
||||
RepoHarness::create("test_retain_data_in_parent_which_is_needed_for_child")?.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
assert!(newtline.get_page_at_lsn(TESTREL_A, 0, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parent_keeps_data_forever_after_branching() -> Result<()> {
|
||||
let harness = RepoHarness::create("test_parent_keeps_data_forever_after_branching")?;
|
||||
let repo = harness.load();
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
make_some_layers(&tline, Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo.get_timeline(NEW_TIMELINE_ID)?;
|
||||
|
||||
make_some_layers(&newtline, Lsn(0x60))?;
|
||||
|
||||
// run gc on parent
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// check that the layer in parent before the branching point is still there
|
||||
let tline_dir = harness.conf.timeline_path(&TIMELINE_ID, &harness.tenant_id);
|
||||
|
||||
let expected_image_layer_path = tline_dir.join(format!(
|
||||
"rel_{}_{}_{}_{}_{}_{:016X}_{:016X}",
|
||||
TESTREL_A_REL_TAG.spcnode,
|
||||
TESTREL_A_REL_TAG.dbnode,
|
||||
TESTREL_A_REL_TAG.relnode,
|
||||
TESTREL_A_REL_TAG.forknum,
|
||||
0, // seg is 0
|
||||
0x20,
|
||||
0x30,
|
||||
));
|
||||
assert!(fs::metadata(&expected_image_layer_path).is_ok());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn corrupt_metadata() -> Result<()> {
|
||||
const TEST_NAME: &str = "corrupt_metadata";
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME);
|
||||
@@ -968,11 +804,7 @@ mod tests {
|
||||
|
||||
let new_repo = harness.load();
|
||||
let err = new_repo.get_timeline(TIMELINE_ID).err().unwrap();
|
||||
assert_eq!(err.to_string(), "failed to load metadata");
|
||||
assert_eq!(
|
||||
err.source().unwrap().to_string(),
|
||||
"metadata checksum mismatch"
|
||||
);
|
||||
assert!(err.to_string().contains("checksum"));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -983,12 +815,7 @@ mod tests {
|
||||
let harness = RepoHarness::create(TEST_NAME)?;
|
||||
let repo = harness.load();
|
||||
|
||||
// Create a timeline with disk_consistent_lsn = 8000
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0x8000))?;
|
||||
let writer = tline.writer();
|
||||
writer.advance_last_record_lsn(Lsn(0x8000));
|
||||
drop(writer);
|
||||
repo.checkpoint_iteration(CheckpointConfig::Forced)?;
|
||||
repo.create_empty_timeline(TIMELINE_ID)?;
|
||||
drop(repo);
|
||||
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
@@ -1002,85 +829,38 @@ mod tests {
|
||||
Ok(())
|
||||
};
|
||||
|
||||
// Helper function to check that a relation file exists, and a corresponding
|
||||
// <filename>.0.old file does not.
|
||||
let assert_exists = |filename: &str| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(path.exists(), "file {} was removed", filename);
|
||||
let image_filename = format!("pg_control_0_{:016X}", 8000);
|
||||
let delta_filename = format!("pg_control_0_{:016X}_{:016X}", 8000, 8008);
|
||||
|
||||
// Check that there is no .old file
|
||||
let backup_path = timeline_path.join(format!("{}.0.old", filename));
|
||||
assert!(
|
||||
!backup_path.exists(),
|
||||
"unexpected backup file {}",
|
||||
backup_path.display()
|
||||
);
|
||||
};
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
// Helper function to check that a relation file does *not* exists, and a corresponding
|
||||
// <filename>.<num>.old file does.
|
||||
let assert_is_renamed = |filename: &str, num: u32| {
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
let check_old = |filename: &str, num: u32| {
|
||||
let path = timeline_path.join(filename);
|
||||
assert!(
|
||||
!path.exists(),
|
||||
"file {} was not removed as expected",
|
||||
filename
|
||||
);
|
||||
assert!(!path.exists());
|
||||
|
||||
let backup_path = timeline_path.join(format!("{}.{}.old", filename, num));
|
||||
assert!(
|
||||
backup_path.exists(),
|
||||
"backup file {} was not created",
|
||||
backup_path.display()
|
||||
);
|
||||
assert!(backup_path.exists());
|
||||
};
|
||||
|
||||
// These files are considered to be in the future and will be renamed out
|
||||
// of the way
|
||||
let future_filenames = vec![
|
||||
format!("pg_control_0_{:016X}", 0x8001),
|
||||
format!("pg_control_0_{:016X}_{:016X}", 0x8001, 0x8008),
|
||||
];
|
||||
// But these are not:
|
||||
let past_filenames = vec![
|
||||
format!("pg_control_0_{:016X}", 0x8000),
|
||||
format!("pg_control_0_{:016X}_{:016X}", 0x7000, 0x8001),
|
||||
];
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
|
||||
for filename in future_filenames.iter().chain(past_filenames.iter()) {
|
||||
make_empty_file(filename)?;
|
||||
}
|
||||
|
||||
// Load the timeline. This will cause the files in the "future" to be renamed
|
||||
// away.
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
for filename in future_filenames.iter() {
|
||||
assert_is_renamed(filename, 0);
|
||||
}
|
||||
for filename in past_filenames.iter() {
|
||||
assert_exists(filename);
|
||||
}
|
||||
|
||||
// Create the future files again, and load again. They should be renamed to
|
||||
// *.1.old this time.
|
||||
for filename in future_filenames.iter() {
|
||||
make_empty_file(filename)?;
|
||||
}
|
||||
make_empty_file(&image_filename)?;
|
||||
make_empty_file(&delta_filename)?;
|
||||
|
||||
let new_repo = harness.load();
|
||||
new_repo.get_timeline(TIMELINE_ID).unwrap();
|
||||
drop(new_repo);
|
||||
|
||||
for filename in future_filenames.iter() {
|
||||
assert_is_renamed(filename, 0);
|
||||
assert_is_renamed(filename, 1);
|
||||
}
|
||||
for filename in past_filenames.iter() {
|
||||
assert_exists(filename);
|
||||
}
|
||||
check_old(&image_filename, 0);
|
||||
check_old(&delta_filename, 0);
|
||||
check_old(&image_filename, 1);
|
||||
check_old(&delta_filename, 1);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use bytes::{Buf, Bytes};
|
||||
use tracing::*;
|
||||
|
||||
use crate::relish::*;
|
||||
@@ -416,6 +416,7 @@ pub fn save_decoded_record(
|
||||
if checkpoint.update_next_xid(decoded.xl_xid) {
|
||||
*checkpoint_modified = true;
|
||||
}
|
||||
|
||||
// Iterate through all the blocks that the record modifies, and
|
||||
// "put" a separate copy of the record for each block.
|
||||
for blk in decoded.blocks.iter() {
|
||||
@@ -426,43 +427,13 @@ pub fn save_decoded_record(
|
||||
forknum: blk.forknum as u8,
|
||||
});
|
||||
|
||||
//
|
||||
// Instead of storing full-page-image WAL record,
|
||||
// it is better to store extracted image: we can skip wal-redo
|
||||
// in this case. Also some FPI records may contain multiple (up to 32) pages,
|
||||
// so them have to be copied multiple times.
|
||||
//
|
||||
if blk.apply_image
|
||||
&& blk.has_image
|
||||
&& decoded.xl_rmid == pg_constants::RM_XLOG_ID
|
||||
&& (decoded.xl_info == pg_constants::XLOG_FPI
|
||||
|| decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
|
||||
// compression of WAL is not yet supported: fall back to storing the original WAL record
|
||||
&& (blk.bimg_info & pg_constants::BKPIMAGE_IS_COMPRESSED) == 0
|
||||
{
|
||||
// Extract page image from FPI record
|
||||
let img_len = blk.bimg_len as usize;
|
||||
let img_offs = blk.bimg_offset as usize;
|
||||
let mut image = BytesMut::with_capacity(pg_constants::BLCKSZ as usize);
|
||||
image.extend_from_slice(&recdata[img_offs..img_offs + img_len]);
|
||||
let rec = WALRecord {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
|
||||
if blk.hole_length != 0 {
|
||||
let tail = image.split_off(blk.hole_offset as usize);
|
||||
image.resize(image.len() + blk.hole_length as usize, 0u8);
|
||||
image.unsplit(tail);
|
||||
}
|
||||
image[0..4].copy_from_slice(&((lsn.0 >> 32) as u32).to_le_bytes());
|
||||
image[4..8].copy_from_slice(&(lsn.0 as u32).to_le_bytes());
|
||||
assert_eq!(image.len(), pg_constants::BLCKSZ as usize);
|
||||
timeline.put_page_image(tag, blk.blkno, lsn, image.freeze())?;
|
||||
} else {
|
||||
let rec = WALRecord {
|
||||
will_init: blk.will_init || blk.apply_image,
|
||||
rec: recdata.clone(),
|
||||
main_data_offset: decoded.main_data_offset as u32,
|
||||
};
|
||||
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
|
||||
}
|
||||
timeline.put_wal_record(lsn, tag, blk.blkno, rec)?;
|
||||
}
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
|
||||
@@ -9,7 +9,7 @@ use crate::walredo::PostgresRedoManager;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use log::{debug, info};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
@@ -49,17 +49,6 @@ pub enum TenantState {
|
||||
Stopping,
|
||||
}
|
||||
|
||||
/// A remote storage timeline synchronization event, that needs another step
|
||||
/// to be fully completed.
|
||||
#[derive(Debug)]
|
||||
pub enum PostTimelineSyncStep {
|
||||
/// The timeline cannot be synchronized anymore due to some sync issues.
|
||||
/// Needs to be removed from pageserver, to avoid further data diverging.
|
||||
Evict,
|
||||
/// A new timeline got downloaded and needs to be loaded into pageserver.
|
||||
RegisterDownload,
|
||||
}
|
||||
|
||||
impl fmt::Display for TenantState {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
@@ -106,66 +95,51 @@ fn init_repo(conf: &'static PageServerConf, tenant_id: ZTenantId) {
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
false,
|
||||
true,
|
||||
));
|
||||
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenant_id).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
tenant.state = TenantState::Active;
|
||||
|
||||
// TODO Start these threads only if tenant actively receives some WAL
|
||||
tenant_threads::start_tenant_threads(conf, tenant_id);
|
||||
}
|
||||
|
||||
pub fn perform_post_timeline_sync_steps(
|
||||
pub fn register_relish_download(
|
||||
conf: &'static PageServerConf,
|
||||
post_sync_steps: HashMap<(ZTenantId, ZTimelineId), PostTimelineSyncStep>,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) {
|
||||
if post_sync_steps.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
info!("Performing {} post-sync steps", post_sync_steps.len());
|
||||
trace!("Steps: {:?}", post_sync_steps);
|
||||
log::info!(
|
||||
"Registering new download, tenant id {}, timeline id: {}",
|
||||
tenant_id,
|
||||
timeline_id
|
||||
);
|
||||
|
||||
{
|
||||
let mut m = access_tenants();
|
||||
for &(tenant_id, timeline_id) in post_sync_steps.keys() {
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Downloading,
|
||||
repo: None,
|
||||
});
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => {
|
||||
init_timeline(repo.as_ref(), timeline_id);
|
||||
tenant.state = TenantState::Idle;
|
||||
return;
|
||||
}
|
||||
None => log::warn!("Initialize new repo"),
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| Tenant {
|
||||
state: TenantState::Downloading,
|
||||
repo: None,
|
||||
});
|
||||
tenant.state = TenantState::Downloading;
|
||||
match &tenant.repo {
|
||||
Some(repo) => {
|
||||
init_timeline(repo.as_ref(), timeline_id);
|
||||
tenant.state = TenantState::Active;
|
||||
return;
|
||||
}
|
||||
tenant.state = TenantState::Idle;
|
||||
None => log::warn!("Initialize new repo"),
|
||||
}
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
for ((tenant_id, timeline_id), post_sync_step) in post_sync_steps {
|
||||
match post_sync_step {
|
||||
PostTimelineSyncStep::Evict => {
|
||||
if let Err(e) = get_repository_for_tenant(tenant_id)
|
||||
.and_then(|repo| repo.unload_timeline(timeline_id))
|
||||
{
|
||||
error!(
|
||||
"Failed to remove repository for tenant {}, timeline {}: {:#}",
|
||||
tenant_id, timeline_id, e
|
||||
)
|
||||
}
|
||||
}
|
||||
PostTimelineSyncStep::RegisterDownload => {
|
||||
// init repo updates Tenant state
|
||||
init_repo(conf, tenant_id);
|
||||
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
// init repo updates Tenant state
|
||||
init_repo(conf, tenant_id);
|
||||
let new_repo = get_repository_for_tenant(tenant_id).unwrap();
|
||||
init_timeline(new_repo.as_ref(), timeline_id);
|
||||
}
|
||||
|
||||
fn init_timeline(repo: &dyn Repository, timeline_id: ZTimelineId) {
|
||||
@@ -223,7 +197,7 @@ pub fn create_repository_for_tenant(
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid).unwrap();
|
||||
tenant.repo = Some(repo);
|
||||
tenant.state = TenantState::Idle;
|
||||
tenant.state = TenantState::Active;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -237,18 +211,18 @@ pub fn get_tenant_state(tenantid: ZTenantId) -> TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_tenant_state(tenantid: ZTenantId, newstate: TenantState) -> Result<TenantState> {
|
||||
pub fn set_tenant_state(tenantid: ZTenantId, state: TenantState) -> Result<TenantState> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.get_mut(&tenantid);
|
||||
|
||||
match tenant {
|
||||
Some(tenant) => {
|
||||
if newstate == TenantState::Idle && tenant.state != TenantState::Active {
|
||||
if state == TenantState::Idle && tenant.state != TenantState::Active {
|
||||
// Only Active tenant can become Idle
|
||||
return Ok(tenant.state);
|
||||
}
|
||||
info!("set_tenant_state: {} -> {}", tenant.state, newstate);
|
||||
tenant.state = newstate;
|
||||
info!("set_tenant_state: {} -> {}", tenant.state, state);
|
||||
tenant.state = state;
|
||||
Ok(tenant.state)
|
||||
}
|
||||
None => bail!("Tenant not found for tenant {}", tenantid),
|
||||
|
||||
@@ -11,7 +11,6 @@ use std::sync::Mutex;
|
||||
use std::thread::JoinHandle;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
struct TenantHandleEntry {
|
||||
@@ -26,44 +25,33 @@ lazy_static! {
|
||||
Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref TENANT_THREADS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"tenant_threads_count",
|
||||
"Number of live tenant threads",
|
||||
&["tenant_thread_type"]
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
}
|
||||
|
||||
// Launch checkpointer and GC for the tenant.
|
||||
// It's possible that the threads are running already,
|
||||
// if so, just don't spawn new ones.
|
||||
pub fn start_tenant_threads(conf: &'static PageServerConf, tenantid: ZTenantId) {
|
||||
//ensure that old threads are stopeed
|
||||
wait_for_tenant_threads_to_stop(tenantid);
|
||||
|
||||
let checkpointer_handle = std::thread::Builder::new()
|
||||
.name("Checkpointer thread".into())
|
||||
.spawn(move || {
|
||||
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
|
||||
})
|
||||
.ok();
|
||||
|
||||
let gc_handle = std::thread::Builder::new()
|
||||
.name("GC thread".into())
|
||||
.spawn(move || {
|
||||
gc_loop(tenantid, conf).expect("GC thread died");
|
||||
})
|
||||
.ok();
|
||||
|
||||
// TODO handle thread errors if any
|
||||
|
||||
let mut handles = TENANT_HANDLES.lock().unwrap();
|
||||
let h = handles
|
||||
.entry(tenantid)
|
||||
.or_insert_with(|| TenantHandleEntry {
|
||||
checkpointer_handle: None,
|
||||
gc_handle: None,
|
||||
});
|
||||
let h = TenantHandleEntry {
|
||||
checkpointer_handle,
|
||||
gc_handle,
|
||||
};
|
||||
|
||||
if h.checkpointer_handle.is_none() {
|
||||
h.checkpointer_handle = std::thread::Builder::new()
|
||||
.name("Checkpointer thread".into())
|
||||
.spawn(move || {
|
||||
checkpoint_loop(tenantid, conf).expect("Checkpointer thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
|
||||
if h.gc_handle.is_none() {
|
||||
h.gc_handle = std::thread::Builder::new()
|
||||
.name("GC thread".into())
|
||||
.spawn(move || {
|
||||
gc_loop(tenantid, conf).expect("GC thread died");
|
||||
})
|
||||
.ok();
|
||||
}
|
||||
handles.insert(tenantid, h);
|
||||
}
|
||||
|
||||
pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
|
||||
@@ -81,12 +69,6 @@ pub fn wait_for_tenant_threads_to_stop(tenantid: ZTenantId) {
|
||||
/// Checkpointer thread's main loop
|
||||
///
|
||||
fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["checkpointer"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
break;
|
||||
@@ -113,12 +95,6 @@ fn checkpoint_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result
|
||||
/// GC thread's main loop
|
||||
///
|
||||
fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
let gauge = TENANT_THREADS_COUNT.with_label_values(&["gc"]);
|
||||
gauge.inc();
|
||||
scopeguard::defer! {
|
||||
gauge.dec();
|
||||
}
|
||||
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != TenantState::Active {
|
||||
break;
|
||||
|
||||
@@ -1,619 +0,0 @@
|
||||
//!
|
||||
//! VirtualFile is like a normal File, but it's not bound directly to
|
||||
//! a file descriptor. Instead, the file is opened when it's read from,
|
||||
//! and if too many files are open globally in the system, least-recently
|
||||
//! used ones are closed.
|
||||
//!
|
||||
//! To track which files have been recently used, we use the clock algorithm
|
||||
//! with a 'recently_used' flag on each slot.
|
||||
//!
|
||||
//! This is similar to PostgreSQL's virtual file descriptor facility in
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
///
|
||||
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
|
||||
/// the underlying file is closed if the system is low on file descriptors,
|
||||
/// and re-opened when it's accessed again.
|
||||
///
|
||||
/// Like with std::fs::File, multiple threads can read/write the file concurrently,
|
||||
/// holding just a shared reference the same VirtualFile, using the read_at() / write_at()
|
||||
/// functions from the FileExt trait. But the functions from the Read/Write/Seek traits
|
||||
/// require a mutable reference, because they modify the "current position".
|
||||
///
|
||||
/// Each VirtualFile has a physical file descriptor in the global OPEN_FILES array, at the
|
||||
/// slot that 'handle points to, if the underlying file is currently open. If it's not
|
||||
/// currently open, the 'handle' can still point to the slot where it was last kept. The
|
||||
/// 'tag' field is used to detect whether the handle still is valid or not.
|
||||
///
|
||||
pub struct VirtualFile {
|
||||
/// Lazy handle to the global file descriptor cache. The slot that this points to
|
||||
/// might contain our File, or it may be empty, or it may contain a File that
|
||||
/// belongs to a different VirtualFile.
|
||||
handle: RwLock<SlotHandle>,
|
||||
|
||||
/// Current file position
|
||||
pos: u64,
|
||||
|
||||
/// File path and options to use to open it.
|
||||
///
|
||||
/// Note: this only contains the options needed to re-open it. For example,
|
||||
/// if a new file is created, we only pass the create flag when it's initially
|
||||
/// opened, in the VirtualFile::create() function, and strip the flag before
|
||||
/// storing it here.
|
||||
pub path: PathBuf,
|
||||
open_options: OpenOptions,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone, Copy)]
|
||||
struct SlotHandle {
|
||||
/// Index into OPEN_FILES.slots
|
||||
index: usize,
|
||||
|
||||
/// Value of 'tag' in the slot. If slot's tag doesn't match, then the slot has
|
||||
/// been recycled and no longer contains the FD for this virtual file.
|
||||
tag: u64,
|
||||
}
|
||||
|
||||
/// OPEN_FILES is the global array that holds the physical file descriptors that
|
||||
/// are currently open. Each slot in the array is protected by a separate lock,
|
||||
/// so that different files can be accessed independently. The lock must be held
|
||||
/// in write mode to replace the slot with a different file, but a read mode
|
||||
/// is enough to operate on the file, whether you're reading or writing to it.
|
||||
///
|
||||
/// OPEN_FILES starts in uninitialized state, and it's initialized by
|
||||
/// the virtual_file::init() function. It must be called exactly once at page
|
||||
/// server startup.
|
||||
static OPEN_FILES: OnceCell<OpenFiles> = OnceCell::new();
|
||||
|
||||
struct OpenFiles {
|
||||
slots: &'static [Slot],
|
||||
|
||||
/// clock arm for the clock algorithm
|
||||
next: AtomicUsize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
inner: RwLock<SlotInner>,
|
||||
|
||||
/// has this file been used since last clock sweep?
|
||||
recently_used: AtomicBool,
|
||||
}
|
||||
|
||||
struct SlotInner {
|
||||
/// Counter that's incremented every time a different file is stored here.
|
||||
/// To avoid the ABA problem.
|
||||
tag: u64,
|
||||
|
||||
/// the underlying file
|
||||
file: Option<File>,
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
/// Find a slot to use, evicting an existing file descriptor if needed.
|
||||
///
|
||||
/// On return, we hold a lock on the slot, and its 'tag' has been updated
|
||||
/// recently_used has been set. It's all ready for reuse.
|
||||
fn find_victim_slot(&self) -> (SlotHandle, RwLockWriteGuard<SlotInner>) {
|
||||
//
|
||||
// Run the clock algorithm to find a slot to replace.
|
||||
//
|
||||
let num_slots = self.slots.len();
|
||||
let mut retries = 0;
|
||||
let mut slot;
|
||||
let mut slot_guard;
|
||||
let index;
|
||||
loop {
|
||||
let next = self.next.fetch_add(1, Ordering::AcqRel) % num_slots;
|
||||
slot = &self.slots[next];
|
||||
|
||||
// If the recently_used flag on this slot is set, continue the clock
|
||||
// sweep. Otherwise try to use this slot. If we cannot acquire the
|
||||
// lock, also continue the clock sweep.
|
||||
//
|
||||
// We only continue in this manner for a while, though. If we loop
|
||||
// through the array twice without finding a victim, just pick the
|
||||
// next slot and wait until we can reuse it. This way, we avoid
|
||||
// spinning in the extreme case that all the slots are busy with an
|
||||
// I/O operation.
|
||||
if retries < num_slots * 2 {
|
||||
if !slot.recently_used.swap(false, Ordering::Release) {
|
||||
if let Ok(guard) = slot.inner.try_write() {
|
||||
slot_guard = guard;
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
retries += 1;
|
||||
} else {
|
||||
slot_guard = slot.inner.write().unwrap();
|
||||
index = next;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// We now have the victim slot locked. If it was in use previously, close the
|
||||
// old file.
|
||||
//
|
||||
if let Some(old_file) = slot_guard.file.take() {
|
||||
drop(old_file);
|
||||
}
|
||||
|
||||
// Prepare the slot for reuse and return it
|
||||
slot_guard.tag += 1;
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
(
|
||||
SlotHandle {
|
||||
index,
|
||||
tag: slot_guard.tag,
|
||||
},
|
||||
slot_guard,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl VirtualFile {
|
||||
/// Open a file in read-only mode. Like File::open.
|
||||
pub fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(path, OpenOptions::new().read(true))
|
||||
}
|
||||
|
||||
/// Create a new file for writing. If the file exists, it will be truncated.
|
||||
/// Like File::create.
|
||||
pub fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
|
||||
Self::open_with_options(
|
||||
path,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)
|
||||
}
|
||||
|
||||
/// Open a file with given options.
|
||||
///
|
||||
/// Note: If any custom flags were set in 'open_options' through OpenOptionsExt,
|
||||
/// they will be applied also when the file is subsequently re-opened, not only
|
||||
/// on the first time. Make sure that's sane!
|
||||
pub fn open_with_options(
|
||||
path: &Path,
|
||||
open_options: &OpenOptions,
|
||||
) -> Result<VirtualFile, std::io::Error> {
|
||||
let (handle, mut slot_guard) = get_open_files().find_victim_slot();
|
||||
|
||||
let file = open_options.open(path)?;
|
||||
|
||||
// Strip all options other than read and write.
|
||||
//
|
||||
// It would perhaps be nicer to check just for the read and write flags
|
||||
// explicitly, but OpenOptions doesn't contain any functions to read flags,
|
||||
// only to set them.
|
||||
let mut reopen_options = open_options.clone();
|
||||
reopen_options.create(false);
|
||||
reopen_options.create_new(false);
|
||||
reopen_options.truncate(false);
|
||||
|
||||
let vfile = VirtualFile {
|
||||
handle: RwLock::new(handle),
|
||||
pos: 0,
|
||||
path: path.to_path_buf(),
|
||||
open_options: reopen_options,
|
||||
};
|
||||
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
Ok(vfile)
|
||||
}
|
||||
|
||||
/// Call File::sync_all() on the underlying File.
|
||||
pub fn sync_all(&self) -> Result<(), Error> {
|
||||
self.with_file(|file| file.sync_all())?
|
||||
}
|
||||
|
||||
/// Helper function that looks up the underlying File for this VirtualFile,
|
||||
/// opening it and evicting some other File if necessary. It calls 'func'
|
||||
/// with the physical File.
|
||||
fn with_file<F, R>(&self, mut func: F) -> Result<R, Error>
|
||||
where
|
||||
F: FnMut(&File) -> R,
|
||||
{
|
||||
let open_files = get_open_files();
|
||||
|
||||
let mut handle_guard = {
|
||||
// Read the cached slot handle, and see if the slot that it points to still
|
||||
// contains our File.
|
||||
//
|
||||
// We only need to hold the handle lock while we read the current handle. If
|
||||
// another thread closes the file and recycles the slot for a different file,
|
||||
// we will notice that the handle we read is no longer valid and retry.
|
||||
let mut handle = *self.handle.read().unwrap();
|
||||
loop {
|
||||
// Check if the slot contains our File
|
||||
{
|
||||
let slot = &open_files.slots[handle.index];
|
||||
let slot_guard = slot.inner.read().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
if let Some(file) = &slot_guard.file {
|
||||
// Found a cached file descriptor.
|
||||
slot.recently_used.store(true, Ordering::Relaxed);
|
||||
return Ok(func(file));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The slot didn't contain our File. We will have to open it ourselves,
|
||||
// but before that, grab a write lock on handle in the VirtualFile, so
|
||||
// that no other thread will try to concurrently open the same file.
|
||||
let handle_guard = self.handle.write().unwrap();
|
||||
|
||||
// If another thread changed the handle while we were not holding the lock,
|
||||
// then the handle might now be valid again. Loop back to retry.
|
||||
if *handle_guard != handle {
|
||||
handle = *handle_guard;
|
||||
continue;
|
||||
}
|
||||
break handle_guard;
|
||||
}
|
||||
};
|
||||
|
||||
// We need to open the file ourselves. The handle in the VirtualFile is
|
||||
// now locked in write-mode. Find a free slot to put it in.
|
||||
let (handle, mut slot_guard) = open_files.find_victim_slot();
|
||||
|
||||
// Open the physical file
|
||||
let file = self.open_options.open(&self.path)?;
|
||||
|
||||
// Perform the requested operation on it
|
||||
//
|
||||
// TODO: We could downgrade the locks to read mode before calling
|
||||
// 'func', to allow a little bit more concurrency, but the standard
|
||||
// library RwLock doesn't allow downgrading without releasing the lock,
|
||||
// and that doesn't seem worth the trouble. (parking_lot RwLock would
|
||||
// allow it)
|
||||
let result = func(&file);
|
||||
|
||||
// Store the File in the slot and update the handle in the VirtualFile
|
||||
// to point to it.
|
||||
slot_guard.file.replace(file);
|
||||
|
||||
*handle_guard = handle;
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for VirtualFile {
|
||||
/// If a VirtualFile is dropped, close the underlying file if it was open.
|
||||
fn drop(&mut self) {
|
||||
let handle = self.handle.get_mut().unwrap();
|
||||
|
||||
// We could check with a read-lock first, to avoid waiting on an
|
||||
// unrelated I/O.
|
||||
let slot = &get_open_files().slots[handle.index];
|
||||
let mut slot_guard = slot.inner.write().unwrap();
|
||||
if slot_guard.tag == handle.tag {
|
||||
slot.recently_used.store(false, Ordering::Relaxed);
|
||||
slot_guard.file.take();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Read for VirtualFile {
|
||||
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.read_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
}
|
||||
|
||||
impl Write for VirtualFile {
|
||||
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
|
||||
let pos = self.pos;
|
||||
let n = self.write_at(buf, pos)?;
|
||||
self.pos += n as u64;
|
||||
Ok(n)
|
||||
}
|
||||
|
||||
fn flush(&mut self) -> Result<(), std::io::Error> {
|
||||
// flush is no-op for File (at least on unix), so we don't need to do
|
||||
// anything here either.
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Seek for VirtualFile {
|
||||
fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
|
||||
match pos {
|
||||
SeekFrom::Start(offset) => {
|
||||
self.pos = offset;
|
||||
}
|
||||
SeekFrom::End(offset) => {
|
||||
self.pos = self.with_file(|mut file| file.seek(SeekFrom::End(offset)))??
|
||||
}
|
||||
SeekFrom::Current(offset) => {
|
||||
let pos = self.pos as i128 + offset as i128;
|
||||
if pos < 0 {
|
||||
return Err(Error::new(
|
||||
ErrorKind::InvalidInput,
|
||||
"offset would be negative",
|
||||
));
|
||||
}
|
||||
if pos > u64::MAX as i128 {
|
||||
return Err(Error::new(ErrorKind::InvalidInput, "offset overflow"));
|
||||
}
|
||||
self.pos = pos as u64;
|
||||
}
|
||||
}
|
||||
Ok(self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
impl FileExt for VirtualFile {
|
||||
fn read_at(&self, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
|
||||
self.with_file(|file| file.read_at(buf, offset))?
|
||||
}
|
||||
|
||||
fn write_at(&self, buf: &[u8], offset: u64) -> Result<usize, Error> {
|
||||
self.with_file(|file| file.write_at(buf, offset))?
|
||||
}
|
||||
}
|
||||
|
||||
impl OpenFiles {
|
||||
fn new(num_slots: usize) -> OpenFiles {
|
||||
let mut slots = Box::new(Vec::with_capacity(num_slots));
|
||||
for _ in 0..num_slots {
|
||||
let slot = Slot {
|
||||
recently_used: AtomicBool::new(false),
|
||||
inner: RwLock::new(SlotInner { tag: 0, file: None }),
|
||||
};
|
||||
slots.push(slot);
|
||||
}
|
||||
|
||||
OpenFiles {
|
||||
next: AtomicUsize::new(0),
|
||||
slots: Box::leak(slots),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Initialize the virtual file module. This must be called once at page
|
||||
/// server startup.
|
||||
///
|
||||
pub fn init(num_slots: usize) {
|
||||
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
|
||||
panic!("virtual_file::init called twice");
|
||||
}
|
||||
}
|
||||
|
||||
const TEST_MAX_FILE_DESCRIPTORS: usize = 10;
|
||||
|
||||
// Get a handle to the global slots array.
|
||||
fn get_open_files() -> &'static OpenFiles {
|
||||
//
|
||||
// In unit tests, page server startup doesn't happen and no one calls
|
||||
// virtual_file::init(). Initialize it here, with a small array.
|
||||
//
|
||||
// This applies to the virtual file tests below, but all other unit
|
||||
// tests too, so the virtual file facility is always usable in
|
||||
// unit tests.
|
||||
//
|
||||
if cfg!(test) {
|
||||
OPEN_FILES.get_or_init(|| OpenFiles::new(TEST_MAX_FILE_DESCRIPTORS))
|
||||
} else {
|
||||
OPEN_FILES.get().expect("virtual_file::init not called yet")
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use rand::seq::SliceRandom;
|
||||
use rand::thread_rng;
|
||||
use rand::Rng;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
// Helper function to slurp contents of a file, starting at the current position,
|
||||
// into a string
|
||||
fn read_string<FD>(vfile: &mut FD) -> Result<String, Error>
|
||||
where
|
||||
FD: Read,
|
||||
{
|
||||
let mut buf = String::new();
|
||||
vfile.read_to_string(&mut buf)?;
|
||||
Ok(buf)
|
||||
}
|
||||
|
||||
// Helper function to slurp a portion of a file into a string
|
||||
fn read_string_at<FD>(vfile: &mut FD, pos: u64, len: usize) -> Result<String, Error>
|
||||
where
|
||||
FD: FileExt,
|
||||
{
|
||||
let mut buf = Vec::new();
|
||||
buf.resize(len, 0);
|
||||
vfile.read_exact_at(&mut buf, pos)?;
|
||||
Ok(String::from_utf8(buf).unwrap())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_virtual_files() -> Result<(), Error> {
|
||||
// The real work is done in the test_files() helper function. This
|
||||
// allows us to run the same set of tests against a native File, and
|
||||
// VirtualFile. We trust the native Files and wouldn't need to test them,
|
||||
// but this allows us to verify that the operations return the same
|
||||
// results with VirtualFiles as with native Files. (Except that with
|
||||
// native files, you will run out of file descriptors if the ulimit
|
||||
// is low enough.)
|
||||
test_files("virtual_files", |path, open_options| {
|
||||
VirtualFile::open_with_options(path, open_options)
|
||||
})
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_physical_files() -> Result<(), Error> {
|
||||
test_files("physical_files", |path, open_options| {
|
||||
open_options.open(path)
|
||||
})
|
||||
}
|
||||
|
||||
fn test_files<OF, FD>(testname: &str, openfunc: OF) -> Result<(), Error>
|
||||
where
|
||||
FD: Read + Write + Seek + FileExt,
|
||||
OF: Fn(&Path, &OpenOptions) -> Result<FD, std::io::Error>,
|
||||
{
|
||||
let testdir = crate::PageServerConf::test_repo_dir(testname);
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
let path_a = testdir.join("file_a");
|
||||
let mut file_a = openfunc(
|
||||
&path_a,
|
||||
OpenOptions::new().write(true).create(true).truncate(true),
|
||||
)?;
|
||||
file_a.write_all(b"foobar")?;
|
||||
|
||||
// cannot read from a file opened in write-only mode
|
||||
assert!(read_string(&mut file_a).is_err());
|
||||
|
||||
// Close the file and re-open for reading
|
||||
let mut file_a = openfunc(&path_a, OpenOptions::new().read(true))?;
|
||||
|
||||
// cannot write to a file opened in read-only mode
|
||||
assert!(file_a.write(b"bar").is_err());
|
||||
|
||||
// Try simple read
|
||||
assert_eq!("foobar", read_string(&mut file_a)?);
|
||||
|
||||
// It's positioned at the EOF now.
|
||||
assert_eq!("", read_string(&mut file_a)?);
|
||||
|
||||
// Test seeks.
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::End(-2))?, 4);
|
||||
assert_eq!("ar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(2))?, 3);
|
||||
assert_eq!("bar", read_string(&mut file_a)?);
|
||||
|
||||
assert_eq!(file_a.seek(SeekFrom::Current(-5))?, 1);
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Test erroneous seeks to before byte 0
|
||||
assert!(file_a.seek(SeekFrom::End(-7)).is_err());
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
assert!(file_a.seek(SeekFrom::Current(-2)).is_err());
|
||||
|
||||
// the erroneous seek should have left the position unchanged
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Create another test file, and try FileExt functions on it.
|
||||
let path_b = testdir.join("file_b");
|
||||
let mut file_b = openfunc(
|
||||
&path_b,
|
||||
OpenOptions::new()
|
||||
.read(true)
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true),
|
||||
)?;
|
||||
file_b.write_all_at(b"BAR", 3)?;
|
||||
file_b.write_all_at(b"FOO", 0)?;
|
||||
|
||||
assert_eq!(read_string_at(&mut file_b, 2, 3)?, "OBA");
|
||||
|
||||
// Open a lot of files, enough to cause some evictions. (Or to be precise,
|
||||
// open the same file many times. The effect is the same.)
|
||||
//
|
||||
// leave file_a positioned at offset 1 before we start
|
||||
assert_eq!(file_a.seek(SeekFrom::Start(1))?, 1);
|
||||
|
||||
let mut vfiles = Vec::new();
|
||||
for _ in 0..100 {
|
||||
let mut vfile = openfunc(&path_b, OpenOptions::new().read(true))?;
|
||||
assert_eq!("FOOBAR", read_string(&mut vfile)?);
|
||||
vfiles.push(vfile);
|
||||
}
|
||||
|
||||
// make sure we opened enough files to definitely cause evictions.
|
||||
assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2);
|
||||
|
||||
// The underlying file descriptor for 'file_a' should be closed now. Try to read
|
||||
// from it again. We left the file positioned at offset 1 above.
|
||||
assert_eq!("oobar", read_string(&mut file_a)?);
|
||||
|
||||
// Check that all the other FDs still work too. Use them in random order for
|
||||
// good measure.
|
||||
vfiles.as_mut_slice().shuffle(&mut thread_rng());
|
||||
for vfile in vfiles.iter_mut() {
|
||||
assert_eq!("OOBAR", read_string_at(vfile, 1, 5)?);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Test using VirtualFiles from many threads concurrently. This tests both using
|
||||
/// a lot of VirtualFiles concurrently, causing evictions, and also using the same
|
||||
/// VirtualFile from multiple threads concurrently.
|
||||
#[test]
|
||||
fn test_vfile_concurrency() -> Result<(), Error> {
|
||||
const SIZE: usize = 8 * 1024;
|
||||
const VIRTUAL_FILES: usize = 100;
|
||||
const THREADS: usize = 100;
|
||||
const SAMPLE: [u8; SIZE] = [0xADu8; SIZE];
|
||||
|
||||
let testdir = crate::PageServerConf::test_repo_dir("vfile_concurrency");
|
||||
std::fs::create_dir_all(&testdir)?;
|
||||
|
||||
// Create a test file.
|
||||
let test_file_path = testdir.join("concurrency_test_file");
|
||||
{
|
||||
let file = File::create(&test_file_path)?;
|
||||
file.write_all_at(&SAMPLE, 0)?;
|
||||
}
|
||||
|
||||
// Open the file many times.
|
||||
let mut files = Vec::new();
|
||||
for _ in 0..VIRTUAL_FILES {
|
||||
let f = VirtualFile::open_with_options(&test_file_path, OpenOptions::new().read(true))?;
|
||||
files.push(f);
|
||||
}
|
||||
let files = Arc::new(files);
|
||||
|
||||
// Launch many threads, and use the virtual files concurrently in random order.
|
||||
let mut threads = Vec::new();
|
||||
for threadno in 0..THREADS {
|
||||
let builder =
|
||||
thread::Builder::new().name(format!("test_vfile_concurrency thread {}", threadno));
|
||||
|
||||
let files = files.clone();
|
||||
let thread = builder
|
||||
.spawn(move || {
|
||||
let mut buf = [0u8; SIZE];
|
||||
let mut rng = rand::thread_rng();
|
||||
for _ in 1..1000 {
|
||||
let f = &files[rng.gen_range(0..files.len())];
|
||||
f.read_exact_at(&mut buf, 0).unwrap();
|
||||
assert!(buf == SAMPLE);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(thread);
|
||||
}
|
||||
|
||||
for thread in threads {
|
||||
thread.join().unwrap();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -229,18 +229,17 @@ pub struct DecodedBkpBlock {
|
||||
pub blkno: u32,
|
||||
|
||||
/* copy of the fork_flags field from the XLogRecordBlockHeader */
|
||||
pub flags: u8,
|
||||
flags: u8,
|
||||
|
||||
/* Information on full-page image, if any */
|
||||
pub has_image: bool, /* has image, even for consistency checking */
|
||||
has_image: bool, /* has image, even for consistency checking */
|
||||
pub apply_image: bool, /* has image that should be restored */
|
||||
pub will_init: bool, /* record doesn't need previous page version to apply */
|
||||
//char *bkp_image;
|
||||
pub hole_offset: u16,
|
||||
pub hole_length: u16,
|
||||
pub bimg_offset: u32,
|
||||
pub bimg_len: u16,
|
||||
pub bimg_info: u8,
|
||||
hole_offset: u16,
|
||||
hole_length: u16,
|
||||
bimg_len: u16,
|
||||
bimg_info: u8,
|
||||
|
||||
/* Buffer holding the rmgr-specific data associated with this block */
|
||||
has_data: bool,
|
||||
@@ -860,19 +859,8 @@ pub fn decode_wal_record(record: Bytes) -> DecodedWALRecord {
|
||||
}
|
||||
|
||||
// 3. Decode blocks.
|
||||
let mut ptr = record.len() - buf.remaining();
|
||||
for blk in blocks.iter_mut() {
|
||||
if blk.has_image {
|
||||
blk.bimg_offset = ptr as u32;
|
||||
ptr += blk.bimg_len as usize;
|
||||
}
|
||||
if blk.has_data {
|
||||
ptr += blk.data_len as usize;
|
||||
}
|
||||
}
|
||||
// We don't need them, so just skip blocks_total_len bytes
|
||||
buf.advance(blocks_total_len as usize);
|
||||
assert_eq!(ptr, record.len() - buf.remaining());
|
||||
|
||||
let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize;
|
||||
|
||||
|
||||
@@ -8,8 +8,6 @@
|
||||
use crate::relish::*;
|
||||
use crate::restore_local_repo;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::tenant_threads;
|
||||
use crate::waldecoder::*;
|
||||
use crate::PageServerConf;
|
||||
use anyhow::{bail, Error, Result};
|
||||
@@ -40,7 +38,6 @@ use zenith_utils::zid::ZTimelineId;
|
||||
struct WalReceiverEntry {
|
||||
wal_producer_connstr: String,
|
||||
wal_receiver_handle: Option<JoinHandle<()>>,
|
||||
tenantid: ZTenantId,
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -68,23 +65,6 @@ pub fn stop_wal_receiver(timelineid: ZTimelineId) {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn drop_wal_receiver(timelineid: ZTimelineId, tenantid: ZTenantId) {
|
||||
let mut receivers = WAL_RECEIVERS.lock().unwrap();
|
||||
receivers.remove(&timelineid);
|
||||
|
||||
// Check if it was the last walreceiver of the tenant.
|
||||
// TODO now we store one WalReceiverEntry per timeline,
|
||||
// so this iterator looks a bit strange.
|
||||
for (_timelineid, entry) in receivers.iter() {
|
||||
if entry.tenantid == tenantid {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// When last walreceiver of the tenant is gone, change state to Idle
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Idle).unwrap();
|
||||
}
|
||||
|
||||
// Launch a new WAL receiver, or tell one that's running about change in connection string
|
||||
pub fn launch_wal_receiver(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -110,13 +90,8 @@ pub fn launch_wal_receiver(
|
||||
let receiver = WalReceiverEntry {
|
||||
wal_producer_connstr: wal_producer_connstr.into(),
|
||||
wal_receiver_handle: Some(wal_receiver_handle),
|
||||
tenantid,
|
||||
};
|
||||
receivers.insert(timelineid, receiver);
|
||||
|
||||
// Update tenant state and start tenant threads, if they are not running yet.
|
||||
tenant_mgr::set_tenant_state(tenantid, TenantState::Active).unwrap();
|
||||
tenant_threads::start_tenant_threads(conf, tenantid);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -139,15 +114,11 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
let _enter = info_span!("WAL receiver", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
info!("WAL receiver thread started");
|
||||
|
||||
let mut retry_count = 10;
|
||||
|
||||
//
|
||||
// Make a connection to the WAL safekeeper, or directly to the primary PostgreSQL server,
|
||||
// and start streaming WAL from it. If the connection is lost, keep retrying.
|
||||
// TODO How long should we retry in case of losing connection?
|
||||
// Should we retry at all or we can wait for the next callmemaybe request?
|
||||
//
|
||||
while !tenant_mgr::shutdown_requested() && retry_count > 0 {
|
||||
while !tenant_mgr::shutdown_requested() {
|
||||
// Look up the current WAL producer address
|
||||
let wal_producer_connstr = get_wal_producer_connstr(timelineid);
|
||||
|
||||
@@ -158,20 +129,10 @@ fn thread_main(conf: &'static PageServerConf, timelineid: ZTimelineId, tenantid:
|
||||
"WAL streaming connection failed ({}), retrying in 1 second",
|
||||
e
|
||||
);
|
||||
retry_count -= 1;
|
||||
sleep(Duration::from_secs(1));
|
||||
} else {
|
||||
info!(
|
||||
"walreceiver disconnected tenant {}, timelineid {}",
|
||||
tenantid, timelineid
|
||||
);
|
||||
break;
|
||||
}
|
||||
}
|
||||
info!("WAL streaming shut down");
|
||||
// Drop it from list of active WAL_RECEIVERS
|
||||
// so that next callmemaybe request launched a new thread
|
||||
drop_wal_receiver(timelineid, tenantid);
|
||||
debug!("WAL streaming shut down");
|
||||
}
|
||||
|
||||
fn walreceiver_main(
|
||||
@@ -321,24 +282,14 @@ fn walreceiver_main(
|
||||
};
|
||||
|
||||
if let Some(last_lsn) = status_update {
|
||||
// TODO: More thought should go into what values are sent here.
|
||||
let last_lsn = PgLsn::from(u64::from(last_lsn));
|
||||
|
||||
// The last LSN we processed. It is not guaranteed to survive pageserver crash.
|
||||
let write_lsn = last_lsn;
|
||||
// This value doesn't guarantee data durability, but it's ok.
|
||||
// In setup with WAL service, pageserver durability is guaranteed by safekeepers.
|
||||
// In setup without WAL service, we just don't care.
|
||||
let flush_lsn = write_lsn;
|
||||
// `disk_consistent_lsn` is the LSN at which page server guarantees persistence of all received data
|
||||
// Depending on the setup we recieve WAL directly from Compute Node or
|
||||
// from a WAL service.
|
||||
//
|
||||
// Senders use the feedback to determine if we are caught up:
|
||||
// - Safekeepers are free to remove WAL preceding `apply_lsn`,
|
||||
// as it will never be requested by this page server.
|
||||
// - Compute Node uses 'apply_lsn' to calculate a lag for back pressure mechanism
|
||||
// (delay WAL inserts to avoid lagging pageserver responses and WAL overflow).
|
||||
let apply_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
|
||||
// We are using disk consistent LSN as `write_lsn`, i.e. LSN at which page server
|
||||
// may guarantee persistence of all received data. Safekeeper is not free to remove
|
||||
// WAL preceding `write_lsn`: it should not be requested by this page server.
|
||||
let write_lsn = PgLsn::from(u64::from(timeline.get_disk_consistent_lsn()));
|
||||
let flush_lsn = last_lsn;
|
||||
let apply_lsn = PgLsn::from(0);
|
||||
let ts = SystemTime::now();
|
||||
const NO_REPLY: u8 = 0;
|
||||
physical_stream.standby_status_update(write_lsn, flush_lsn, apply_lsn, ts, NO_REPLY)?;
|
||||
@@ -349,7 +300,6 @@ fn walreceiver_main(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -22,24 +22,23 @@ use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use nix::poll::*;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
use std::fs::OpenOptions;
|
||||
use std::io::prelude::*;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::os::unix::io::AsRawFd;
|
||||
use std::io::Error;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Stdio;
|
||||
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tokio::io::AsyncBufReadExt;
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use tokio::process::{ChildStdin, ChildStdout, Command};
|
||||
use tokio::time::timeout;
|
||||
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::nonblock::set_nonblock;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::relish::*;
|
||||
@@ -54,8 +53,6 @@ use postgres_ffi::nonrelfile_utils::transaction_id_set_status;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::XLogRecord;
|
||||
|
||||
const N_WAL_REDO_PROCS: usize = 1;
|
||||
|
||||
///
|
||||
/// `RelTag` + block number (`blknum`) gives us a unique id of the page in the cluster.
|
||||
///
|
||||
@@ -142,8 +139,8 @@ pub struct PostgresRedoManager {
|
||||
tenantid: ZTenantId,
|
||||
conf: &'static PageServerConf,
|
||||
|
||||
round_robin: AtomicUsize,
|
||||
processes: [Mutex<Option<PostgresRedoProcess>>; N_WAL_REDO_PROCS],
|
||||
runtime: tokio::runtime::Runtime,
|
||||
process: Mutex<Option<PostgresRedoProcess>>,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -213,29 +210,25 @@ impl WalRedoManager for PostgresRedoManager {
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(start_time).as_secs_f64());
|
||||
} else {
|
||||
let rr = self.round_robin.fetch_add(1, Ordering::Relaxed) % N_WAL_REDO_PROCS;
|
||||
let mut process_guard = self.processes[rr].lock().unwrap();
|
||||
let mut process_guard = self.process.lock().unwrap();
|
||||
let lock_time = Instant::now();
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenantid, rr)?;
|
||||
let p = self
|
||||
.runtime
|
||||
.block_on(PostgresRedoProcess::launch(self.conf, &self.tenantid))?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
|
||||
result = self.handle_apply_request_postgres(process, &request);
|
||||
result = self
|
||||
.runtime
|
||||
.block_on(self.handle_apply_request_postgres(process, &request));
|
||||
|
||||
WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
|
||||
end_time = Instant::now();
|
||||
WAL_REDO_TIME.observe(end_time.duration_since(lock_time).as_secs_f64());
|
||||
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
let process = process_guard.take().unwrap();
|
||||
process.kill();
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
@@ -247,19 +240,27 @@ impl PostgresRedoManager {
|
||||
/// Create a new PostgresRedoManager.
|
||||
///
|
||||
pub fn new(conf: &'static PageServerConf, tenantid: ZTenantId) -> PostgresRedoManager {
|
||||
// We block on waiting for requests on the walredo request channel, but
|
||||
// use async I/O to communicate with the child process. Initialize the
|
||||
// runtime for the async part.
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()
|
||||
.unwrap();
|
||||
|
||||
// The actual process is launched lazily, on first request.
|
||||
PostgresRedoManager {
|
||||
runtime,
|
||||
tenantid,
|
||||
conf,
|
||||
round_robin: AtomicUsize::new(0),
|
||||
processes: [(); N_WAL_REDO_PROCS].map(|_| Mutex::new(None)),
|
||||
process: Mutex::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Process one request for WAL redo using wal-redo postgres
|
||||
///
|
||||
fn handle_apply_request_postgres(
|
||||
async fn handle_apply_request_postgres(
|
||||
&self,
|
||||
process: &mut PostgresRedoProcess,
|
||||
request: &WalRedoRequest,
|
||||
@@ -277,14 +278,14 @@ impl PostgresRedoManager {
|
||||
if let RelishTag::Relation(rel) = request.rel {
|
||||
// Relational WAL records are applied using wal-redo-postgres
|
||||
let buf_tag = BufferTag { rel, blknum };
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records);
|
||||
apply_result = process.apply_wal_records(buf_tag, base_img, records).await;
|
||||
|
||||
let duration = start.elapsed();
|
||||
|
||||
debug!(
|
||||
"postgres applied {} WAL records in {} us to reconstruct page image at LSN {}",
|
||||
"postgres applied {} WAL records in {} ms to reconstruct page image at LSN {}",
|
||||
nrecords,
|
||||
duration.as_micros(),
|
||||
duration.as_millis(),
|
||||
lsn
|
||||
);
|
||||
|
||||
@@ -468,27 +469,22 @@ impl PostgresRedoManager {
|
||||
/// Handle to the Postgres WAL redo process
|
||||
///
|
||||
struct PostgresRedoProcess {
|
||||
child: Child,
|
||||
stdin: ChildStdin,
|
||||
stdout: ChildStdout,
|
||||
stderr: ChildStderr,
|
||||
}
|
||||
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
fn launch(
|
||||
async fn launch(
|
||||
conf: &PageServerConf,
|
||||
tenantid: &ZTenantId,
|
||||
id: usize,
|
||||
) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = conf
|
||||
.tenant_path(tenantid)
|
||||
.join(format! {"wal-redo-datadir-{}", id});
|
||||
let datadir = conf.tenant_path(tenantid).join("wal-redo-datadir");
|
||||
|
||||
// Create empty data directory for wal-redo postgres, deleting old one first.
|
||||
if datadir.exists() {
|
||||
@@ -505,6 +501,7 @@ impl PostgresRedoProcess {
|
||||
.env("LD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir().to_str().unwrap())
|
||||
.output()
|
||||
.await
|
||||
.expect("failed to execute initdb");
|
||||
|
||||
if !initdb.status.success() {
|
||||
@@ -541,139 +538,102 @@ impl PostgresRedoProcess {
|
||||
datadir.display()
|
||||
);
|
||||
|
||||
let stdin = child.stdin.take().unwrap();
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
let stdin = child.stdin.take().expect("failed to open child's stdin");
|
||||
let stderr = child.stderr.take().expect("failed to open child's stderr");
|
||||
let stdout = child.stdout.take().expect("failed to open child's stdout");
|
||||
|
||||
set_nonblock(stdin.as_raw_fd())?;
|
||||
set_nonblock(stdout.as_raw_fd())?;
|
||||
set_nonblock(stderr.as_raw_fd())?;
|
||||
// This async block reads the child's stderr, and forwards it to the logger
|
||||
let f_stderr = async {
|
||||
let mut stderr_buffered = tokio::io::BufReader::new(stderr);
|
||||
|
||||
Ok(PostgresRedoProcess {
|
||||
child,
|
||||
stdin,
|
||||
stdout,
|
||||
stderr,
|
||||
})
|
||||
}
|
||||
let mut line = String::new();
|
||||
loop {
|
||||
let res = stderr_buffered.read_line(&mut line).await;
|
||||
if res.is_err() {
|
||||
debug!("could not convert line to utf-8");
|
||||
continue;
|
||||
}
|
||||
if res.unwrap() == 0 {
|
||||
break;
|
||||
}
|
||||
error!("wal-redo-postgres: {}", line.trim());
|
||||
line.clear();
|
||||
}
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
tokio::spawn(f_stderr);
|
||||
|
||||
fn kill(mut self) {
|
||||
let _ = self.child.kill();
|
||||
if let Ok(exit_status) = self.child.wait() {
|
||||
error!("wal-redo-postgres exited with code {}", exit_status);
|
||||
}
|
||||
drop(self);
|
||||
Ok(PostgresRedoProcess { stdin, stdout })
|
||||
}
|
||||
|
||||
//
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
fn apply_wal_records(
|
||||
async fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, WALRecord)],
|
||||
) -> Result<Bytes, std::io::Error> {
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
//
|
||||
// This could be problematic if there are millions of records to replay,
|
||||
// but in practice the number of records is usually so small that it doesn't
|
||||
// matter, and it's better to keep this code simple.
|
||||
let mut writebuf: Vec<u8> = Vec::new();
|
||||
build_begin_redo_for_block_msg(tag, &mut writebuf);
|
||||
if let Some(img) = base_img {
|
||||
build_push_page_msg(tag, &img, &mut writebuf);
|
||||
}
|
||||
for (lsn, rec) in records.iter() {
|
||||
build_apply_record_msg(*lsn, &rec.rec, &mut writebuf);
|
||||
}
|
||||
build_get_page_msg(tag, &mut writebuf);
|
||||
WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
|
||||
|
||||
// The input is now in 'writebuf'. Do a blind write first, writing as much as
|
||||
// we can, before calling poll(). That skips one call to poll() if the stdin is
|
||||
// already available for writing, which it almost certainly is because the
|
||||
// process is idle.
|
||||
let mut nwrite = self.stdin.write(&writebuf)?;
|
||||
|
||||
// We expect the WAL redo process to respond with an 8k page image. We read it
|
||||
// into this buffer.
|
||||
let mut resultbuf = vec![0; pg_constants::BLCKSZ.into()];
|
||||
let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
|
||||
|
||||
// Prepare for calling poll()
|
||||
let mut pollfds = [
|
||||
PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN),
|
||||
PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN),
|
||||
PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT),
|
||||
];
|
||||
let stdout = &mut self.stdout;
|
||||
// Buffer the writes to avoid a lot of small syscalls.
|
||||
let mut stdin = tokio::io::BufWriter::new(&mut self.stdin);
|
||||
|
||||
// We do three things simultaneously: send the old base image and WAL records to
|
||||
// the child process's stdin, read the result from child's stdout, and forward any logging
|
||||
// information that the child writes to its stderr to the page server's log.
|
||||
while nresult < pg_constants::BLCKSZ.into() {
|
||||
// If we have more data to write, wake up if 'stdin' becomes writeable or
|
||||
// we have data to read. Otherwise only wake up if there's data to read.
|
||||
let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
|
||||
let n = nix::poll::poll(&mut pollfds[0..nfds], TIMEOUT.as_millis() as i32)?;
|
||||
|
||||
if n == 0 {
|
||||
return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
|
||||
//
|
||||
// 'f_stdin' handles writing the base image and WAL records to the child process.
|
||||
// 'f_stdout' below reads the result back. And 'f_stderr', which was spawned into the
|
||||
// tokio runtime in the 'launch' function already, forwards the logging.
|
||||
let f_stdin = async {
|
||||
// Send base image, if any. (If the record initializes the page, previous page
|
||||
// version is not needed.)
|
||||
timeout(
|
||||
TIMEOUT,
|
||||
stdin.write_all(&build_begin_redo_for_block_msg(tag)),
|
||||
)
|
||||
.await??;
|
||||
if let Some(img) = base_img {
|
||||
timeout(TIMEOUT, stdin.write_all(&build_push_page_msg(tag, &img))).await??;
|
||||
}
|
||||
|
||||
// If we have some messages in stderr, forward them to the log.
|
||||
let err_revents = pollfds[1].revents().unwrap();
|
||||
if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
let mut errbuf: [u8; 16384] = [0; 16384];
|
||||
let n = self.stderr.read(&mut errbuf)?;
|
||||
// Send WAL records.
|
||||
for (lsn, rec) in records.iter() {
|
||||
WAL_REDO_RECORD_COUNTER.inc();
|
||||
|
||||
// The message might not be split correctly into lines here. But this is
|
||||
// good enough, the important thing is to get the message to the log.
|
||||
if n > 0 {
|
||||
error!(
|
||||
"wal-redo-postgres: {}",
|
||||
String::from_utf8_lossy(&errbuf[0..n])
|
||||
);
|
||||
stdin
|
||||
.write_all(&build_apply_record_msg(*lsn, &rec.rec))
|
||||
.await?;
|
||||
|
||||
// To make sure we capture all log from the process if it fails, keep
|
||||
// reading from the stderr, before checking the stdout.
|
||||
continue;
|
||||
}
|
||||
} else if err_revents.contains(PollFlags::POLLHUP) {
|
||||
return Err(Error::new(
|
||||
ErrorKind::BrokenPipe,
|
||||
"WAL redo process closed its stderr unexpectedly",
|
||||
));
|
||||
//debug!("sent WAL record to wal redo postgres process ({:X}/{:X}",
|
||||
// r.lsn >> 32, r.lsn & 0xffff_ffff);
|
||||
}
|
||||
//debug!("sent {} WAL records to wal redo postgres process ({:X}/{:X}",
|
||||
// records.len(), lsn >> 32, lsn & 0xffff_ffff);
|
||||
|
||||
// If we have more data to write and 'stdin' is writeable, do write.
|
||||
if nwrite < writebuf.len() {
|
||||
let in_revents = pollfds[2].revents().unwrap();
|
||||
if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
|
||||
nwrite += self.stdin.write(&writebuf[nwrite..])?;
|
||||
} else if in_revents.contains(PollFlags::POLLHUP) {
|
||||
// We still have more data to write, but the process closed the pipe.
|
||||
return Err(Error::new(
|
||||
ErrorKind::BrokenPipe,
|
||||
"WAL redo process closed its stdin unexpectedly",
|
||||
));
|
||||
}
|
||||
}
|
||||
// Send GetPage command to get the result back
|
||||
timeout(TIMEOUT, stdin.write_all(&build_get_page_msg(tag))).await??;
|
||||
timeout(TIMEOUT, stdin.flush()).await??;
|
||||
//debug!("sent GetPage for {}", tag.blknum);
|
||||
Ok::<(), Error>(())
|
||||
};
|
||||
|
||||
// If we have some data in stdout, read it to the result buffer.
|
||||
let out_revents = pollfds[0].revents().unwrap();
|
||||
if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
|
||||
nresult += self.stdout.read(&mut resultbuf[nresult..])?;
|
||||
} else if out_revents.contains(PollFlags::POLLHUP) {
|
||||
return Err(Error::new(
|
||||
ErrorKind::BrokenPipe,
|
||||
"WAL redo process closed its stdout unexpectedly",
|
||||
));
|
||||
}
|
||||
}
|
||||
// Read back new page image
|
||||
let f_stdout = async {
|
||||
let mut buf = [0u8; 8192];
|
||||
|
||||
Ok(Bytes::from(resultbuf))
|
||||
timeout(TIMEOUT, stdout.read_exact(&mut buf)).await??;
|
||||
//debug!("got response for {}", tag.blknum);
|
||||
Ok::<[u8; 8192], Error>(buf)
|
||||
};
|
||||
|
||||
let res = tokio::try_join!(f_stdout, f_stdin)?;
|
||||
|
||||
let buf = res.0;
|
||||
|
||||
Ok::<Bytes, Error>(Bytes::from(std::vec::Vec::from(buf)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -681,42 +641,62 @@ impl PostgresRedoProcess {
|
||||
// process. See vendor/postgres/src/backend/tcop/zenith_wal_redo.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
fn build_begin_redo_for_block_msg(tag: BufferTag) -> Vec<u8> {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'B');
|
||||
buf.put_u32(len as u32);
|
||||
|
||||
tag.ser_into(buf)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8], buf: &mut Vec<u8>) {
|
||||
fn build_push_page_msg(tag: BufferTag, base_img: &[u8]) -> Vec<u8> {
|
||||
assert!(base_img.len() == 8192);
|
||||
|
||||
let len = 4 + 1 + 4 * 4 + base_img.len();
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'P');
|
||||
buf.put_u32(len as u32);
|
||||
tag.ser_into(buf)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
buf.put(base_img);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8], buf: &mut Vec<u8>) {
|
||||
fn build_apply_record_msg(endlsn: Lsn, rec: &[u8]) -> Vec<u8> {
|
||||
let len = 4 + 8 + rec.len();
|
||||
let mut buf: Vec<u8> = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'A');
|
||||
buf.put_u32(len as u32);
|
||||
buf.put_u64(endlsn.0);
|
||||
buf.put(rec);
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
fn build_get_page_msg(tag: BufferTag, buf: &mut Vec<u8>) {
|
||||
fn build_get_page_msg(tag: BufferTag) -> Vec<u8> {
|
||||
let len = 4 + 1 + 4 * 4;
|
||||
let mut buf = Vec::with_capacity(1 + len);
|
||||
|
||||
buf.put_u8(b'G');
|
||||
buf.put_u32(len as u32);
|
||||
tag.ser_into(buf)
|
||||
tag.ser_into(&mut buf)
|
||||
.expect("serialize BufferTag should always succeed");
|
||||
|
||||
debug_assert!(buf.len() == 1 + len);
|
||||
|
||||
buf
|
||||
}
|
||||
|
||||
@@ -43,9 +43,6 @@ pub const XLOG_SIZE_OF_XLOG_RECORD: usize = std::mem::size_of::<XLogRecord>();
|
||||
#[allow(clippy::identity_op)]
|
||||
pub const SIZE_OF_XLOG_RECORD_DATA_HEADER_SHORT: usize = 1 * 2;
|
||||
|
||||
// PG timeline is always 1, changing it doesn't have useful meaning in Zenith.
|
||||
pub const PG_TLI: u32 = 1;
|
||||
|
||||
pub type XLogRecPtr = u64;
|
||||
pub type TimeLineID = u32;
|
||||
pub type TimestampTz = i64;
|
||||
@@ -187,13 +184,8 @@ fn find_end_of_wal_segment(
|
||||
let xl_tot_len = LittleEndian::read_u32(&buf[page_offs..page_offs + 4]) as usize;
|
||||
if xl_tot_len == 0 {
|
||||
info!(
|
||||
"find_end_of_wal_segment reached zeros at {:?}, last records ends at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size)),
|
||||
Lsn(XLogSegNoOffsetToRecPtr(
|
||||
segno,
|
||||
last_valid_rec_pos as u32,
|
||||
wal_seg_size
|
||||
))
|
||||
"find_end_of_wal_segment reached zeros at {:?}",
|
||||
Lsn(XLogSegNoOffsetToRecPtr(segno, offs as u32, wal_seg_size))
|
||||
);
|
||||
break; // zeros, reached the end
|
||||
}
|
||||
@@ -308,17 +300,12 @@ pub fn find_end_of_wal(
|
||||
high_segno,
|
||||
);
|
||||
}
|
||||
let start_offset = if start_lsn.segment_number(wal_seg_size) == high_segno {
|
||||
start_lsn.segment_offset(wal_seg_size)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
high_offs = find_end_of_wal_segment(
|
||||
data_dir,
|
||||
high_segno,
|
||||
high_tli,
|
||||
wal_seg_size,
|
||||
start_offset,
|
||||
start_lsn.segment_offset(wal_seg_size),
|
||||
)?;
|
||||
}
|
||||
let high_ptr = XLogSegNoOffsetToRecPtr(high_segno, high_offs, wal_seg_size);
|
||||
@@ -434,7 +421,7 @@ pub fn generate_wal_segment(segno: u64, system_id: u64) -> Bytes {
|
||||
XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: pg_constants::XLP_LONG_HEADER,
|
||||
xlp_tli: PG_TLI,
|
||||
xlp_tli: 1, // FIXME: always use Postgres timeline 1
|
||||
xlp_pageaddr: pageaddr,
|
||||
xlp_rem_len: 0,
|
||||
..Default::default() // Put 0 in padding fields.
|
||||
|
||||
@@ -37,27 +37,20 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str:
|
||||
return cmd
|
||||
|
||||
|
||||
def yapf(fix_inplace: bool) -> str:
|
||||
cmd = "pipenv run yapf --recursive"
|
||||
if fix_inplace:
|
||||
cmd += " --in-place"
|
||||
else:
|
||||
cmd += " --diff"
|
||||
return cmd
|
||||
|
||||
|
||||
def mypy() -> str:
|
||||
return "pipenv run mypy"
|
||||
|
||||
|
||||
def get_commit_files() -> List[str]:
|
||||
files = subprocess.check_output("git diff --cached --name-only --diff-filter=ACM".split())
|
||||
files = subprocess.check_output(
|
||||
"git diff --cached --name-only --diff-filter=ACM".split()
|
||||
)
|
||||
return files.decode().splitlines()
|
||||
|
||||
|
||||
def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False):
|
||||
def check(
|
||||
name: str, suffix: str, cmd: str, changed_files: List[str], no_color: bool = False
|
||||
):
|
||||
print(f"Checking: {name} ", end="")
|
||||
applicable_files = list(filter(lambda fname: fname.strip().endswith(suffix), changed_files))
|
||||
applicable_files = list(
|
||||
filter(lambda fname: fname.strip().endswith(suffix), changed_files)
|
||||
)
|
||||
if not applicable_files:
|
||||
print(colorify("[NOT APPLICABLE]", Color.CYAN, no_color))
|
||||
return
|
||||
@@ -66,14 +59,7 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
|
||||
res = subprocess.run(cmd.split(), capture_output=True)
|
||||
if res.returncode != 0:
|
||||
print(colorify("[FAILED]", Color.RED, no_color))
|
||||
if name == "mypy":
|
||||
print("Please inspect the output below and fix type mismatches.")
|
||||
else:
|
||||
print("Please inspect the output below and run make fmt to fix automatically.")
|
||||
if suffix == ".py":
|
||||
print("If the output is empty, ensure that you've installed Python tooling by\n"
|
||||
"running 'pipenv install --dev' in the current directory (no root needed)")
|
||||
print()
|
||||
print("Please inspect the output below and run make fmt to fix automatically\n")
|
||||
print(res.stdout.decode())
|
||||
exit(1)
|
||||
|
||||
@@ -82,11 +68,12 @@ def check(name: str, suffix: str, cmd: str, changed_files: List[str], no_color:
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--fix-inplace", action="store_true", help="apply fixes inplace")
|
||||
parser.add_argument("--no-color",
|
||||
action="store_true",
|
||||
help="disable colored output",
|
||||
default=not sys.stdout.isatty())
|
||||
parser.add_argument(
|
||||
"--fix-inplace", action="store_true", help="apply fixes inplace"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-color", action="store_true", help="disable colored output", default=not sys.stdout.isatty()
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
files = get_commit_files()
|
||||
@@ -100,17 +87,3 @@ if __name__ == "__main__":
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="yapf",
|
||||
suffix=".py",
|
||||
cmd=yapf(fix_inplace=args.fix_inplace),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
check(
|
||||
name="mypy",
|
||||
suffix=".py",
|
||||
cmd=mypy(),
|
||||
changed_files=files,
|
||||
no_color=args.no_color,
|
||||
)
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
use anyhow::{anyhow, bail, Context};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::net::{SocketAddr, ToSocketAddrs};
|
||||
|
||||
use crate::state::ProxyWaiters;
|
||||
pub struct CPlaneApi {
|
||||
auth_endpoint: &'static str,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug, Default)]
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
pub struct DatabaseInfo {
|
||||
pub host: String,
|
||||
pub port: u16,
|
||||
@@ -14,21 +16,20 @@ pub struct DatabaseInfo {
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
#[serde(untagged)]
|
||||
enum ProxyAuthResponse {
|
||||
Ready { conn_info: DatabaseInfo },
|
||||
Error { error: String },
|
||||
NotReady { ready: bool }, // TODO: get rid of `ready`
|
||||
pub struct ProxyAuthResult {
|
||||
pub ready: bool,
|
||||
pub error: Option<String>,
|
||||
pub conn_info: Option<DatabaseInfo>,
|
||||
}
|
||||
|
||||
impl DatabaseInfo {
|
||||
pub fn socket_addr(&self) -> anyhow::Result<SocketAddr> {
|
||||
pub fn socket_addr(&self) -> Result<SocketAddr> {
|
||||
let host_port = format!("{}:{}", self.host, self.port);
|
||||
host_port
|
||||
.to_socket_addrs()
|
||||
.with_context(|| format!("cannot resolve {} to SocketAddr", host_port))?
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("cannot resolve at least one SocketAddr"))
|
||||
.ok_or_else(|| anyhow::Error::msg("cannot resolve at least one SocketAddr"))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,21 +51,11 @@ impl From<DatabaseInfo> for tokio_postgres::Config {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CPlaneApi<'a> {
|
||||
auth_endpoint: &'a str,
|
||||
waiters: &'a ProxyWaiters,
|
||||
}
|
||||
|
||||
impl<'a> CPlaneApi<'a> {
|
||||
pub fn new(auth_endpoint: &'a str, waiters: &'a ProxyWaiters) -> Self {
|
||||
Self {
|
||||
auth_endpoint,
|
||||
waiters,
|
||||
}
|
||||
impl CPlaneApi {
|
||||
pub fn new(auth_endpoint: &'static str) -> CPlaneApi {
|
||||
CPlaneApi { auth_endpoint }
|
||||
}
|
||||
}
|
||||
|
||||
impl CPlaneApi<'_> {
|
||||
pub fn authenticate_proxy_request(
|
||||
&self,
|
||||
user: &str,
|
||||
@@ -72,7 +63,7 @@ impl CPlaneApi<'_> {
|
||||
md5_response: &[u8],
|
||||
salt: &[u8; 4],
|
||||
psql_session_id: &str,
|
||||
) -> anyhow::Result<DatabaseInfo> {
|
||||
) -> Result<ProxyAuthResult> {
|
||||
let mut url = reqwest::Url::parse(self.auth_endpoint)?;
|
||||
url.query_pairs_mut()
|
||||
.append_pair("login", user)
|
||||
@@ -81,59 +72,17 @@ impl CPlaneApi<'_> {
|
||||
.append_pair("salt", &hex::encode(salt))
|
||||
.append_pair("psql_session_id", psql_session_id);
|
||||
|
||||
let waiter = self.waiters.register(psql_session_id.to_owned());
|
||||
println!("cplane request: {}", url.as_str());
|
||||
|
||||
println!("cplane request: {}", url);
|
||||
let resp = reqwest::blocking::get(url)?;
|
||||
if !resp.status().is_success() {
|
||||
bail!("Auth failed: {}", resp.status())
|
||||
}
|
||||
|
||||
let auth_info: ProxyAuthResponse = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got auth info: #{:?}", auth_info);
|
||||
if resp.status().is_success() {
|
||||
let auth_info: ProxyAuthResult = serde_json::from_str(resp.text()?.as_str())?;
|
||||
println!("got auth info: #{:?}", auth_info);
|
||||
|
||||
use ProxyAuthResponse::*;
|
||||
match auth_info {
|
||||
Ready { conn_info } => Ok(conn_info),
|
||||
Error { error } => bail!(error),
|
||||
NotReady { .. } => waiter.wait()?.map_err(|e| anyhow!(e)),
|
||||
Ok(auth_info)
|
||||
} else {
|
||||
bail!("Auth failed")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use serde_json::json;
|
||||
|
||||
#[test]
|
||||
fn test_proxy_auth_response() {
|
||||
// Ready
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": true,
|
||||
"conn_info": DatabaseInfo::default(),
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(
|
||||
auth,
|
||||
ProxyAuthResponse::Ready {
|
||||
conn_info: DatabaseInfo { .. }
|
||||
}
|
||||
));
|
||||
|
||||
// Error
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": false,
|
||||
"error": "too bad, so sad",
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(auth, ProxyAuthResponse::Error { .. }));
|
||||
|
||||
// NotReady
|
||||
let auth: ProxyAuthResponse = serde_json::from_value(json!({
|
||||
"ready": false,
|
||||
}))
|
||||
.unwrap();
|
||||
assert!(matches!(auth, ProxyAuthResponse::NotReady { .. }));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,21 +5,78 @@
|
||||
/// (control plane API in our case) and can create new databases and accounts
|
||||
/// in somewhat transparent manner (again via communication with control plane API).
|
||||
///
|
||||
use anyhow::bail;
|
||||
use clap::{App, Arg};
|
||||
use state::{ProxyConfig, ProxyState};
|
||||
use std::thread;
|
||||
use zenith_utils::{tcp_listener, GIT_VERSION};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
net::{SocketAddr, TcpListener},
|
||||
sync::{mpsc, Arc, Mutex},
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context};
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
|
||||
use cplane_api::DatabaseInfo;
|
||||
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
|
||||
|
||||
mod cplane_api;
|
||||
mod mgmt;
|
||||
mod proxy;
|
||||
mod state;
|
||||
mod waiters;
|
||||
|
||||
pub struct ProxyConf {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// http management endpoint. Upon user account creation control plane
|
||||
/// will notify us here, so that we can 'unfreeze' user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<Arc<ServerConfig>>,
|
||||
}
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConf,
|
||||
pub waiters: Mutex<HashMap<String, mpsc::Sender<anyhow::Result<DatabaseInfo>>>>,
|
||||
}
|
||||
|
||||
fn configure_ssl(arg_matches: &ArgMatches) -> anyhow::Result<Option<Arc<ServerConfig>>> {
|
||||
let (key_path, cert_path) = match (
|
||||
arg_matches.value_of("ssl-key"),
|
||||
arg_matches.value_of("ssl-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => (key_path, cert_path),
|
||||
(None, None) => return Ok(None),
|
||||
_ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
|
||||
};
|
||||
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
};
|
||||
|
||||
let cert_chain = {
|
||||
let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
|
||||
pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS certificates"))?
|
||||
};
|
||||
|
||||
let mut config = ServerConfig::new(NoClientAuth::new());
|
||||
config.set_single_cert(cert_chain, key)?;
|
||||
config.versions = vec![ProtocolVersion::TLSv1_3];
|
||||
|
||||
Ok(Some(Arc::new(config)))
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("Zenith proxy/router")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("proxy")
|
||||
.short("p")
|
||||
@@ -68,34 +125,25 @@ fn main() -> anyhow::Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let ssl_config = match (
|
||||
arg_matches.value_of("ssl-key"),
|
||||
arg_matches.value_of("ssl-cert"),
|
||||
) {
|
||||
(Some(key_path), Some(cert_path)) => {
|
||||
Some(crate::state::configure_ssl(key_path, cert_path)?)
|
||||
}
|
||||
(None, None) => None,
|
||||
_ => bail!("either both or neither ssl-key and ssl-cert must be specified"),
|
||||
};
|
||||
|
||||
let config = ProxyConfig {
|
||||
let conf = ProxyConf {
|
||||
proxy_address: arg_matches.value_of("proxy").unwrap().parse()?,
|
||||
mgmt_address: arg_matches.value_of("mgmt").unwrap().parse()?,
|
||||
redirect_uri: arg_matches.value_of("uri").unwrap().parse()?,
|
||||
auth_endpoint: arg_matches.value_of("auth-endpoint").unwrap().parse()?,
|
||||
ssl_config,
|
||||
ssl_config: configure_ssl(&arg_matches)?,
|
||||
};
|
||||
let state: &ProxyState = Box::leak(Box::new(ProxyState::new(config)));
|
||||
|
||||
println!("Version: {}", GIT_VERSION);
|
||||
let state = ProxyState {
|
||||
conf,
|
||||
waiters: Mutex::new(HashMap::new()),
|
||||
};
|
||||
let state: &'static ProxyState = Box::leak(Box::new(state));
|
||||
|
||||
// Check that we can bind to address before further initialization
|
||||
println!("Starting proxy on {}", state.conf.proxy_address);
|
||||
let pageserver_listener = tcp_listener::bind(state.conf.proxy_address)?;
|
||||
let pageserver_listener = TcpListener::bind(state.conf.proxy_address)?;
|
||||
|
||||
println!("Starting mgmt on {}", state.conf.mgmt_address);
|
||||
let mgmt_listener = tcp_listener::bind(state.conf.mgmt_address)?;
|
||||
let mgmt_listener = TcpListener::bind(state.conf.mgmt_address)?;
|
||||
|
||||
let threads = [
|
||||
// Spawn a thread to listen for connections. It will spawn further threads
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::{
|
||||
thread,
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use bytes::Bytes;
|
||||
use serde::Deserialize;
|
||||
use zenith_utils::{
|
||||
@@ -24,23 +25,22 @@ pub fn thread_main(state: &'static ProxyState, listener: TcpListener) -> anyhow:
|
||||
socket.set_nodelay(true).unwrap();
|
||||
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = handle_connection(state, socket) {
|
||||
if let Err(err) = mgmt_conn_main(state, socket) {
|
||||
println!("error: {}", err);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_connection(state: &ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
pub fn mgmt_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let mut conn_handler = MgmtHandler { state };
|
||||
let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None, true)?;
|
||||
pgbackend.run(&mut conn_handler)
|
||||
}
|
||||
|
||||
struct MgmtHandler<'a> {
|
||||
state: &'a ProxyState,
|
||||
struct MgmtHandler {
|
||||
state: &'static ProxyState,
|
||||
}
|
||||
|
||||
/// Serialized examples:
|
||||
// {
|
||||
// "session_id": "71d6d03e6d93d99a",
|
||||
@@ -64,18 +64,18 @@ struct MgmtHandler<'a> {
|
||||
// // to test manually by sending a query to mgmt interface:
|
||||
// psql -h 127.0.0.1 -p 9999 -c '{"session_id":"4f10dde522e14739","result":{"Success":{"host":"127.0.0.1","port":5432,"dbname":"stas","user":"stas","password":"stas"}}}'
|
||||
#[derive(Deserialize)]
|
||||
struct PsqlSessionResponse {
|
||||
pub struct PsqlSessionResponse {
|
||||
session_id: String,
|
||||
result: PsqlSessionResult,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
enum PsqlSessionResult {
|
||||
pub enum PsqlSessionResult {
|
||||
Success(DatabaseInfo),
|
||||
Failure(String),
|
||||
}
|
||||
|
||||
impl postgres_backend::Handler for MgmtHandler<'_> {
|
||||
impl postgres_backend::Handler for MgmtHandler {
|
||||
fn process_query(
|
||||
&mut self,
|
||||
pgb: &mut PostgresBackend,
|
||||
@@ -96,26 +96,32 @@ fn try_process_query(
|
||||
query_string: Bytes,
|
||||
) -> anyhow::Result<()> {
|
||||
let query_string = query_from_cstring(query_string);
|
||||
|
||||
println!("Got mgmt query: '{}'", std::str::from_utf8(&query_string)?);
|
||||
|
||||
let resp: PsqlSessionResponse = serde_json::from_slice(&query_string)?;
|
||||
|
||||
use PsqlSessionResult::*;
|
||||
let msg = match resp.result {
|
||||
Success(db_info) => Ok(db_info),
|
||||
Failure(message) => Err(message),
|
||||
};
|
||||
let waiters = mgmt.state.waiters.lock().unwrap();
|
||||
|
||||
let sender = waiters
|
||||
.get(&resp.session_id)
|
||||
.ok_or_else(|| anyhow::Error::msg("psql_session_id is not found"))?;
|
||||
|
||||
match resp.result {
|
||||
PsqlSessionResult::Success(db_info) => {
|
||||
sender.send(Ok(db_info))?;
|
||||
|
||||
match mgmt.state.waiters.notify(&resp.session_id, msg) {
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[Some(b"ok")]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
pgb.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => {
|
||||
pgb.write_message(&BeMessage::ErrorResponse(e.to_string()))?;
|
||||
|
||||
PsqlSessionResult::Failure(message) => {
|
||||
sender.send(Err(anyhow::Error::msg(message.clone())))?;
|
||||
|
||||
bail!("psql session request failed: {}", message)
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
use crate::cplane_api::{CPlaneApi, DatabaseInfo};
|
||||
use crate::cplane_api::CPlaneApi;
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use crate::ProxyState;
|
||||
use anyhow::{anyhow, bail};
|
||||
use std::net::TcpStream;
|
||||
use std::{io, thread};
|
||||
|
||||
use anyhow::bail;
|
||||
use tokio_postgres::NoTls;
|
||||
use zenith_utils::postgres_backend::{self, PostgresBackend, ProtoState, Stream};
|
||||
use zenith_utils::pq_proto::{BeMessage as Be, FeMessage as Fe, *};
|
||||
|
||||
use rand::Rng;
|
||||
use std::{io, sync::mpsc::channel, thread};
|
||||
use zenith_utils::postgres_backend::Stream;
|
||||
use zenith_utils::postgres_backend::{PostgresBackend, ProtoState};
|
||||
use zenith_utils::pq_proto::*;
|
||||
use zenith_utils::sock_split::{ReadStream, WriteStream};
|
||||
use zenith_utils::{postgres_backend, pq_proto::BeMessage};
|
||||
|
||||
///
|
||||
/// Main proxy listener loop.
|
||||
@@ -32,213 +37,256 @@ pub fn thread_main(
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: clean up fields
|
||||
// XXX: clean up fields
|
||||
struct ProxyConnection {
|
||||
state: &'static ProxyState,
|
||||
psql_session_id: String,
|
||||
|
||||
cplane: CPlaneApi,
|
||||
|
||||
user: String,
|
||||
database: String,
|
||||
|
||||
pgb: PostgresBackend,
|
||||
md5_salt: [u8; 4],
|
||||
|
||||
psql_session_id: String,
|
||||
}
|
||||
|
||||
pub fn proxy_conn_main(state: &'static ProxyState, socket: TcpStream) -> anyhow::Result<()> {
|
||||
let conn = ProxyConnection {
|
||||
pub fn proxy_conn_main(
|
||||
state: &'static ProxyState,
|
||||
socket: std::net::TcpStream,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut conn = ProxyConnection {
|
||||
state,
|
||||
psql_session_id: hex::encode(rand::random::<[u8; 8]>()),
|
||||
cplane: CPlaneApi::new(&state.conf.auth_endpoint),
|
||||
user: "".into(),
|
||||
database: "".into(),
|
||||
pgb: PostgresBackend::new(
|
||||
socket,
|
||||
postgres_backend::AuthType::MD5,
|
||||
state.conf.ssl_config.clone(),
|
||||
false,
|
||||
)?,
|
||||
md5_salt: [0u8; 4],
|
||||
psql_session_id: "".into(),
|
||||
};
|
||||
|
||||
let (client, server) = conn.handle_client()?;
|
||||
// Check StartupMessage
|
||||
// This will set conn.existing_user and we can decide on next actions
|
||||
conn.handle_startup()?;
|
||||
|
||||
let server = zenith_utils::sock_split::BidiStream::from_tcp(server);
|
||||
let mut psql_session_id_buf = [0u8; 8];
|
||||
rand::thread_rng().fill(&mut psql_session_id_buf);
|
||||
conn.psql_session_id = hex::encode(psql_session_id_buf);
|
||||
|
||||
let client = match client {
|
||||
Stream::Bidirectional(bidi_stream) => bidi_stream,
|
||||
_ => panic!("invalid stream type"),
|
||||
// both scenarious here should end up producing database connection string
|
||||
let conn_info = if conn.is_existing_user() {
|
||||
conn.handle_existing_user()?
|
||||
} else {
|
||||
conn.handle_new_user()?
|
||||
};
|
||||
|
||||
proxy(client.split(), server.split())
|
||||
// XXX: move that inside handle_new_user/handle_existing_user to be able to
|
||||
// report wrong connection error.
|
||||
proxy_pass(conn.pgb, conn_info)
|
||||
}
|
||||
|
||||
impl ProxyConnection {
|
||||
fn handle_client(mut self) -> anyhow::Result<(Stream, TcpStream)> {
|
||||
let mut authenticate = || {
|
||||
let (username, dbname) = self.handle_startup()?;
|
||||
|
||||
// Both scenarios here should end up producing database credentials
|
||||
if username.ends_with("@zenith") {
|
||||
self.handle_existing_user(&username, &dbname)
|
||||
} else {
|
||||
self.handle_new_user()
|
||||
}
|
||||
};
|
||||
|
||||
let conn = match authenticate() {
|
||||
Ok(db_info) => connect_to_db(db_info),
|
||||
Err(e) => {
|
||||
// Report the error to the client
|
||||
self.pgb.write_message(&Be::ErrorResponse(e.to_string()))?;
|
||||
bail!("failed to handle client: {:?}", e);
|
||||
}
|
||||
};
|
||||
|
||||
// We'll get rid of this once migration to async is complete
|
||||
let (pg_version, db_stream) = {
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let (pg_version, stream) = runtime.block_on(conn)?;
|
||||
let stream = stream.into_std()?;
|
||||
stream.set_nonblocking(false)?;
|
||||
|
||||
(pg_version, stream)
|
||||
};
|
||||
|
||||
// Let the client send new requests
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus(
|
||||
BeParameterStatusMessage::ServerVersion(&pg_version),
|
||||
))?
|
||||
.write_message(&Be::ReadyForQuery)?;
|
||||
|
||||
Ok((self.pgb.into_stream(), db_stream))
|
||||
fn is_existing_user(&self) -> bool {
|
||||
self.user.ends_with("@zenith")
|
||||
}
|
||||
|
||||
fn handle_startup(&mut self) -> anyhow::Result<(String, String)> {
|
||||
let have_tls = self.pgb.tls_config.is_some();
|
||||
fn handle_startup(&mut self) -> anyhow::Result<()> {
|
||||
let mut encrypted = false;
|
||||
|
||||
loop {
|
||||
let mut msg = match self.pgb.read_message()? {
|
||||
Some(Fe::StartupMessage(msg)) => msg,
|
||||
None => bail!("connection is lost"),
|
||||
bad => bail!("unexpected message type: {:?}", bad),
|
||||
};
|
||||
println!("got message: {:?}", msg);
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
match msg {
|
||||
Some(FeMessage::StartupMessage(m)) => {
|
||||
println!("got startup message {:?}", m);
|
||||
|
||||
match msg.kind {
|
||||
StartupRequestCode::NegotiateGss => {
|
||||
self.pgb.write_message(&Be::EncryptionResponse(false))?;
|
||||
}
|
||||
StartupRequestCode::NegotiateSsl => {
|
||||
self.pgb.write_message(&Be::EncryptionResponse(have_tls))?;
|
||||
if have_tls {
|
||||
self.pgb.start_tls()?;
|
||||
encrypted = true;
|
||||
match m.kind {
|
||||
StartupRequestCode::NegotiateGss => {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
StartupRequestCode::NegotiateSsl => {
|
||||
println!("SSL requested");
|
||||
if self.pgb.tls_config.is_some() {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(true))?;
|
||||
self.pgb.start_tls()?;
|
||||
encrypted = true;
|
||||
} else {
|
||||
self.pgb
|
||||
.write_message(&BeMessage::EncryptionResponse(false))?;
|
||||
}
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
if self.state.conf.ssl_config.is_some() && !encrypted {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(
|
||||
"must connect with TLS".to_string(),
|
||||
))?;
|
||||
bail!("client did not connect with TLS");
|
||||
}
|
||||
self.user = m
|
||||
.params
|
||||
.get("user")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("user is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
self.database = m
|
||||
.params
|
||||
.get("database")
|
||||
.ok_or_else(|| {
|
||||
anyhow::Error::msg("database is required in startup packet")
|
||||
})?
|
||||
.into();
|
||||
|
||||
break;
|
||||
}
|
||||
StartupRequestCode::Cancel => break,
|
||||
}
|
||||
}
|
||||
StartupRequestCode::Normal => {
|
||||
if have_tls && !encrypted {
|
||||
bail!("must connect with TLS");
|
||||
}
|
||||
None => {
|
||||
bail!("connection closed")
|
||||
}
|
||||
unexpected => {
|
||||
bail!("unexpected message type : {:?}", unexpected)
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
let mut get_param = |key| {
|
||||
msg.params
|
||||
.remove(key)
|
||||
.ok_or_else(|| anyhow!("{} is missing in startup packet", key))
|
||||
// Wait for proxy kick form the console with conninfo
|
||||
fn wait_for_conninfo(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let (tx, rx) = channel::<anyhow::Result<DatabaseInfo>>();
|
||||
let _ = self
|
||||
.state
|
||||
.waiters
|
||||
.lock()
|
||||
.unwrap()
|
||||
.insert(self.psql_session_id.clone(), tx);
|
||||
|
||||
// Wait for web console response
|
||||
// TODO: respond with error to client
|
||||
rx.recv()?
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
// ask password
|
||||
rand::thread_rng().fill(&mut self.md5_salt);
|
||||
|
||||
self.pgb
|
||||
.write_message(&BeMessage::AuthenticationMD5Password(&self.md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// check password
|
||||
println!("handle_existing_user");
|
||||
let msg = self.pgb.read_message()?;
|
||||
println!("got message {:?}", msg);
|
||||
if let Some(FeMessage::PasswordMessage(m)) = msg {
|
||||
println!("got password message '{:?}'", m);
|
||||
|
||||
assert!(self.is_existing_user());
|
||||
|
||||
let (_trailing_null, md5_response) = m
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow::Error::msg("unexpected password message"))?;
|
||||
|
||||
match self.cplane.authenticate_proxy_request(
|
||||
self.user.as_str(),
|
||||
self.database.as_str(),
|
||||
md5_response,
|
||||
&self.md5_salt,
|
||||
&self.psql_session_id,
|
||||
) {
|
||||
Err(e) => {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
|
||||
"cannot authenticate proxy: {}",
|
||||
e
|
||||
)))?;
|
||||
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
|
||||
Ok(auth_info) => {
|
||||
let conn_info = if auth_info.ready {
|
||||
// Cluster is ready, so just take `conn_info` and respond to the client.
|
||||
auth_info
|
||||
.conn_info
|
||||
.expect("conn_info should be provided with ready cluster")
|
||||
} else {
|
||||
match auth_info.error {
|
||||
Some(e) => {
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(format!(
|
||||
"cannot authenticate proxy: {}",
|
||||
e
|
||||
)))?;
|
||||
|
||||
bail!("auth failed: {}", e);
|
||||
}
|
||||
None => {
|
||||
// Cluster exists, but isn't active, await its start and proxy kick
|
||||
// with `conn_info`.
|
||||
self.wait_for_conninfo()?
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
return Ok((get_param("user")?, get_param("database")?));
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
}
|
||||
// TODO: implement proper stmt cancellation
|
||||
StartupRequestCode::Cancel => bail!("query cancellation is not supported"),
|
||||
}
|
||||
} else {
|
||||
bail!("protocol violation");
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_existing_user(&mut self, user: &str, db: &str) -> anyhow::Result<DatabaseInfo> {
|
||||
let md5_salt = rand::random::<[u8; 4]>();
|
||||
|
||||
// Ask password
|
||||
self.pgb
|
||||
.write_message(&Be::AuthenticationMD5Password(&md5_salt))?;
|
||||
self.pgb.state = ProtoState::Authentication; // XXX
|
||||
|
||||
// Check password
|
||||
let msg = match self.pgb.read_message()? {
|
||||
Some(Fe::PasswordMessage(msg)) => msg,
|
||||
None => bail!("connection is lost"),
|
||||
bad => bail!("unexpected message type: {:?}", bad),
|
||||
};
|
||||
println!("got message: {:?}", msg);
|
||||
|
||||
let (_trailing_null, md5_response) = msg
|
||||
.split_last()
|
||||
.ok_or_else(|| anyhow!("unexpected password message"))?;
|
||||
|
||||
let cplane = CPlaneApi::new(&self.state.conf.auth_endpoint, &self.state.waiters);
|
||||
let db_info = cplane.authenticate_proxy_request(
|
||||
user,
|
||||
db,
|
||||
md5_response,
|
||||
&md5_salt,
|
||||
&self.psql_session_id,
|
||||
)?;
|
||||
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?;
|
||||
|
||||
Ok(db_info)
|
||||
}
|
||||
|
||||
fn handle_new_user(&mut self) -> anyhow::Result<DatabaseInfo> {
|
||||
let greeting = hello_message(&self.state.conf.redirect_uri, &self.psql_session_id);
|
||||
let hello_message = format!("☀️ Welcome to Zenith!
|
||||
|
||||
// First, register this session
|
||||
let waiter = self.state.waiters.register(self.psql_session_id.clone());
|
||||
To proceed with database creation, open the following link:
|
||||
|
||||
// Give user a URL to spawn a new database
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::AuthenticationOk)?
|
||||
.write_message_noflush(&BeParameterStatusMessage::encoding())?
|
||||
.write_message(&Be::NoticeResponse(greeting))?;
|
||||
{redirect_uri}{sess_id}
|
||||
|
||||
// Wait for web console response
|
||||
let db_info = waiter.wait()?.map_err(|e| anyhow!(e))?;
|
||||
It needs to be done once and we will send you '.pgpass' file, which will allow you to access or create
|
||||
databases without opening the browser.
|
||||
|
||||
", redirect_uri = self.state.conf.redirect_uri, sess_id = self.psql_session_id);
|
||||
|
||||
self.pgb
|
||||
.write_message_noflush(&Be::NoticeResponse("Connecting to database.".into()))?;
|
||||
.write_message_noflush(&BeMessage::AuthenticationOk)?;
|
||||
self.pgb
|
||||
.write_message_noflush(&BeMessage::ParameterStatus)?;
|
||||
self.pgb
|
||||
.write_message(&BeMessage::NoticeResponse(hello_message))?;
|
||||
|
||||
Ok(db_info)
|
||||
// We requested the DB creation from the console. Now wait for conninfo
|
||||
let conn_info = self.wait_for_conninfo()?;
|
||||
|
||||
self.pgb.write_message_noflush(&BeMessage::NoticeResponse(
|
||||
"Connecting to database.".to_string(),
|
||||
))?;
|
||||
self.pgb.write_message(&BeMessage::ReadyForQuery)?;
|
||||
|
||||
Ok(conn_info)
|
||||
}
|
||||
}
|
||||
|
||||
fn hello_message(redirect_uri: &str, session_id: &str) -> String {
|
||||
format!(
|
||||
concat![
|
||||
"☀️ Welcome to Zenith!\n",
|
||||
"To proceed with database creation, open the following link:\n\n",
|
||||
" {redirect_uri}{session_id}\n\n",
|
||||
"It needs to be done once and we will send you '.pgpass' file,\n",
|
||||
"which will allow you to access or create ",
|
||||
"databases without opening your web browser."
|
||||
],
|
||||
redirect_uri = redirect_uri,
|
||||
session_id = session_id,
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a TCP connection to a postgres database, authenticate with it, and receive the ReadyForQuery message
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<(String, tokio::net::TcpStream)> {
|
||||
async fn connect_to_db(db_info: DatabaseInfo) -> anyhow::Result<tokio::net::TcpStream> {
|
||||
let mut socket = tokio::net::TcpStream::connect(db_info.socket_addr()?).await?;
|
||||
let config = tokio_postgres::Config::from(db_info);
|
||||
let (client, conn) = config.connect_raw(&mut socket, NoTls).await?;
|
||||
|
||||
let query = client.query_one("select current_setting('server_version')", &[]);
|
||||
|
||||
tokio::pin!(query, conn);
|
||||
|
||||
let version = tokio::select!(
|
||||
x = query => x?.try_get(0)?,
|
||||
_ = conn => bail!("connection closed too early"),
|
||||
);
|
||||
|
||||
Ok((version, socket))
|
||||
let _ = config.connect_raw(&mut socket, NoTls).await?;
|
||||
Ok(socket)
|
||||
}
|
||||
|
||||
/// Concurrently proxy both directions of the client and server connections
|
||||
@@ -256,7 +304,7 @@ fn proxy(
|
||||
// so we can afford to lose `res` in case `flush` fails
|
||||
let res = self.0.write(buf);
|
||||
if res.is_ok() {
|
||||
self.flush()?;
|
||||
self.0.flush()?;
|
||||
}
|
||||
res
|
||||
}
|
||||
@@ -278,3 +326,26 @@ fn proxy(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Proxy a client connection to a postgres database
|
||||
fn proxy_pass(pgb: PostgresBackend, db_info: DatabaseInfo) -> anyhow::Result<()> {
|
||||
let db_stream = {
|
||||
// We'll get rid of this once migration to async is complete
|
||||
let runtime = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let stream = runtime.block_on(connect_to_db(db_info))?.into_std()?;
|
||||
stream.set_nonblocking(false)?;
|
||||
stream
|
||||
};
|
||||
|
||||
let db = zenith_utils::sock_split::BidiStream::from_tcp(db_stream);
|
||||
|
||||
let client = match pgb.into_stream() {
|
||||
Stream::Bidirectional(bidi_stream) => bidi_stream,
|
||||
_ => bail!("invalid stream"),
|
||||
};
|
||||
|
||||
proxy(client.split(), db.split())
|
||||
}
|
||||
|
||||
@@ -1,62 +0,0 @@
|
||||
use crate::cplane_api::DatabaseInfo;
|
||||
use anyhow::{anyhow, ensure, Context};
|
||||
use rustls::{internal::pemfile, NoClientAuth, ProtocolVersion, ServerConfig};
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
|
||||
pub type SslConfig = Arc<ServerConfig>;
|
||||
|
||||
pub struct ProxyConfig {
|
||||
/// main entrypoint for users to connect to
|
||||
pub proxy_address: SocketAddr,
|
||||
|
||||
/// http management endpoint. Upon user account creation control plane
|
||||
/// will notify us here, so that we can 'unfreeze' user session.
|
||||
pub mgmt_address: SocketAddr,
|
||||
|
||||
/// send unauthenticated users to this URI
|
||||
pub redirect_uri: String,
|
||||
|
||||
/// control plane address where we would check auth.
|
||||
pub auth_endpoint: String,
|
||||
|
||||
pub ssl_config: Option<SslConfig>,
|
||||
}
|
||||
|
||||
pub type ProxyWaiters = crate::waiters::Waiters<Result<DatabaseInfo, String>>;
|
||||
|
||||
pub struct ProxyState {
|
||||
pub conf: ProxyConfig,
|
||||
pub waiters: ProxyWaiters,
|
||||
}
|
||||
|
||||
impl ProxyState {
|
||||
pub fn new(conf: ProxyConfig) -> Self {
|
||||
Self {
|
||||
conf,
|
||||
waiters: ProxyWaiters::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn configure_ssl(key_path: &str, cert_path: &str) -> anyhow::Result<SslConfig> {
|
||||
let key = {
|
||||
let key_bytes = std::fs::read(key_path).context("SSL key file")?;
|
||||
let mut keys = pemfile::pkcs8_private_keys(&mut &key_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS keys"))?;
|
||||
ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len());
|
||||
keys.pop().unwrap()
|
||||
};
|
||||
|
||||
let cert_chain = {
|
||||
let cert_chain_bytes = std::fs::read(cert_path).context("SSL cert file")?;
|
||||
pemfile::certs(&mut &cert_chain_bytes[..])
|
||||
.map_err(|_| anyhow!("couldn't read TLS certificates"))?
|
||||
};
|
||||
|
||||
let mut config = ServerConfig::new(NoClientAuth::new());
|
||||
config.set_single_cert(cert_chain, key)?;
|
||||
config.versions = vec![ProtocolVersion::TLSv1_3];
|
||||
|
||||
Ok(config.into())
|
||||
}
|
||||
@@ -1,58 +0,0 @@
|
||||
use anyhow::{anyhow, Context};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::{mpsc, Mutex};
|
||||
|
||||
pub struct Waiters<T>(pub(self) Mutex<HashMap<String, mpsc::Sender<T>>>);
|
||||
|
||||
impl<T> Default for Waiters<T> {
|
||||
fn default() -> Self {
|
||||
Waiters(Default::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Waiters<T> {
|
||||
pub fn register(&self, key: String) -> Waiter<T> {
|
||||
let (tx, rx) = mpsc::channel();
|
||||
|
||||
// TODO: use `try_insert` (unstable)
|
||||
let prev = self.0.lock().unwrap().insert(key.clone(), tx);
|
||||
assert!(matches!(prev, None)); // assert_matches! is nightly-only
|
||||
|
||||
Waiter {
|
||||
receiver: rx,
|
||||
registry: self,
|
||||
key,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn notify(&self, key: &str, value: T) -> anyhow::Result<()>
|
||||
where
|
||||
T: Send + Sync + 'static,
|
||||
{
|
||||
let tx = self
|
||||
.0
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(key)
|
||||
.ok_or_else(|| anyhow!("key {} not found", key))?;
|
||||
tx.send(value).context("channel hangup")
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Waiter<'a, T> {
|
||||
receiver: mpsc::Receiver<T>,
|
||||
registry: &'a Waiters<T>,
|
||||
key: String,
|
||||
}
|
||||
|
||||
impl<T> Waiter<'_, T> {
|
||||
pub fn wait(self) -> anyhow::Result<T> {
|
||||
self.receiver.recv().context("channel hangup")
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Drop for Waiter<'_, T> {
|
||||
fn drop(&mut self) {
|
||||
self.registry.0.lock().unwrap().remove(&self.key);
|
||||
}
|
||||
}
|
||||
510
scripts/coverage
510
scripts/coverage
@@ -1,510 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Here'a good link in case you're interested in learning more
|
||||
# about current deficiencies of rust code coverage story:
|
||||
# https://github.com/rust-lang/rust/issues?q=is%3Aissue+is%3Aopen+instrument-coverage+label%3AA-code-coverage
|
||||
#
|
||||
# Also a couple of inspirational tools which I deliberately ended up not using:
|
||||
# * https://github.com/mozilla/grcov
|
||||
# * https://github.com/taiki-e/cargo-llvm-cov
|
||||
# * https://github.com/llvm/llvm-project/tree/main/llvm/test/tools/llvm-cov
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from textwrap import dedent
|
||||
from typing import Any, Iterable, List, Optional
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def intersperse(sep: Any, iterable: Iterable[Any]):
|
||||
fst = True
|
||||
for item in iterable:
|
||||
if not fst:
|
||||
yield sep
|
||||
fst = False
|
||||
yield item
|
||||
|
||||
|
||||
def find_demangler(demangler=None):
|
||||
known_tools = ['c++filt', 'rustfilt', 'llvm-cxxfilt']
|
||||
|
||||
if demangler:
|
||||
# Explicit argument has precedence over `known_tools`
|
||||
demanglers = [demangler]
|
||||
else:
|
||||
demanglers = known_tools
|
||||
|
||||
for demangler in demanglers:
|
||||
if shutil.which(demangler):
|
||||
return demangler
|
||||
|
||||
raise Exception(' '.join([
|
||||
'Failed to find symbol demangler.',
|
||||
'Please install it or provide another tool',
|
||||
f"(e.g. {', '.join(known_tools)})",
|
||||
]))
|
||||
|
||||
|
||||
class Cargo:
|
||||
def __init__(self, cwd: Path):
|
||||
self.cwd = cwd
|
||||
self.target_dir = Path(os.environ.get('CARGO_TARGET_DIR', cwd / 'target')).resolve()
|
||||
self._rustlib_dir = None
|
||||
|
||||
@property
|
||||
def rustlib_dir(self):
|
||||
if not self._rustlib_dir:
|
||||
cmd = [
|
||||
'cargo',
|
||||
'-Zunstable-options',
|
||||
'rustc',
|
||||
'--print=target-libdir',
|
||||
]
|
||||
self._rustlib_dir = Path(subprocess.check_output(cmd, cwd=self.cwd, text=True)).parent
|
||||
|
||||
return self._rustlib_dir
|
||||
|
||||
def binaries(self, profile: str) -> List[str]:
|
||||
executables = []
|
||||
|
||||
# This will emit json messages containing test binaries names
|
||||
cmd = [
|
||||
'cargo',
|
||||
'test',
|
||||
'--no-run',
|
||||
'--message-format=json',
|
||||
]
|
||||
env = dict(os.environ, PROFILE=profile)
|
||||
output = subprocess.check_output(cmd, cwd=self.cwd, env=env, text=True)
|
||||
|
||||
for line in output.splitlines(keepends=False):
|
||||
meta = json.loads(line)
|
||||
exe = meta.get('executable')
|
||||
if exe:
|
||||
executables.append(exe)
|
||||
|
||||
# Metadata contains crate names, which can be used
|
||||
# to recover names of executables, e.g. `pageserver`
|
||||
cmd = [
|
||||
'cargo',
|
||||
'metadata',
|
||||
'--format-version=1',
|
||||
'--no-deps',
|
||||
]
|
||||
meta = json.loads(subprocess.check_output(cmd, cwd=self.cwd))
|
||||
|
||||
for pkg in meta.get('packages', []):
|
||||
for target in pkg.get('targets', []):
|
||||
if 'bin' in target['kind']:
|
||||
exe = self.target_dir / profile / target['name']
|
||||
if exe.exists():
|
||||
executables.append(str(exe))
|
||||
|
||||
return executables
|
||||
|
||||
|
||||
@dataclass
|
||||
class LLVM:
|
||||
cargo: Cargo
|
||||
|
||||
def resolve_tool(self, name: str) -> str:
|
||||
exe = self.cargo.rustlib_dir / 'bin' / name
|
||||
if exe.exists():
|
||||
return str(exe)
|
||||
|
||||
if not shutil.which(name):
|
||||
# Show a user-friendly warning
|
||||
raise Exception(' '.join([
|
||||
f"It appears that you don't have `{name}` installed.",
|
||||
"Please execute `rustup component add llvm-tools-preview`,",
|
||||
"or install it via your package manager of choice.",
|
||||
"LLVM tools should be the same version as LLVM in `rustc --version --verbose`.",
|
||||
]))
|
||||
|
||||
return name
|
||||
|
||||
def profdata(self, input_dir: Path, output_profdata: Path):
|
||||
profraws = [f for f in input_dir.iterdir() if f.suffix == '.profraw']
|
||||
if not profraws:
|
||||
raise Exception(f'No profraw files found at {input_dir}')
|
||||
|
||||
with open(input_dir / 'profraw.list', 'w') as input_files:
|
||||
profraw_mtime = 0
|
||||
for profraw in profraws:
|
||||
profraw_mtime = max(profraw_mtime, profraw.stat().st_mtime_ns)
|
||||
print(profraw, file=input_files)
|
||||
input_files.flush()
|
||||
|
||||
try:
|
||||
profdata_mtime = output_profdata.stat().st_mtime_ns
|
||||
except FileNotFoundError:
|
||||
profdata_mtime = 0
|
||||
|
||||
# An obvious make-ish optimization
|
||||
if profraw_mtime >= profdata_mtime:
|
||||
subprocess.check_call([
|
||||
self.resolve_tool('llvm-profdata'),
|
||||
'merge',
|
||||
'-sparse',
|
||||
f'-input-files={input_files.name}',
|
||||
f'-output={output_profdata}',
|
||||
])
|
||||
|
||||
def _cov(self,
|
||||
*extras,
|
||||
subcommand: str,
|
||||
profdata: Path,
|
||||
objects: List[str],
|
||||
sources: List[str],
|
||||
demangler: Optional[str] = None) -> None:
|
||||
|
||||
cwd = self.cargo.cwd
|
||||
objects = list(intersperse('-object', objects))
|
||||
extras = list(extras)
|
||||
|
||||
# For some reason `rustc` produces relative paths to src files,
|
||||
# so we force it to cut the $PWD prefix.
|
||||
# see: https://github.com/rust-lang/rust/issues/34701#issuecomment-739809584
|
||||
if sources:
|
||||
extras.append(f'-path-equivalence=.,{cwd.resolve()}')
|
||||
|
||||
if demangler:
|
||||
extras.append(f'-Xdemangler={demangler}')
|
||||
|
||||
cmd = [
|
||||
self.resolve_tool('llvm-cov'),
|
||||
subcommand, # '-dump-collected-paths', # classified debug flag
|
||||
'-instr-profile',
|
||||
str(profdata),
|
||||
*extras,
|
||||
*objects,
|
||||
*sources,
|
||||
]
|
||||
subprocess.check_call(cmd, cwd=cwd)
|
||||
|
||||
def cov_report(self, **kwargs) -> None:
|
||||
self._cov(subcommand='report', **kwargs)
|
||||
|
||||
def cov_export(self, *, kind: str, **kwargs) -> None:
|
||||
extras = [f'-format={kind}']
|
||||
self._cov(subcommand='export', *extras, **kwargs)
|
||||
|
||||
def cov_show(self, *, kind: str, output_dir: Optional[Path] = None, **kwargs) -> None:
|
||||
extras = [f'-format={kind}']
|
||||
if output_dir:
|
||||
extras.append(f'-output-dir={output_dir}')
|
||||
|
||||
self._cov(subcommand='show', *extras, **kwargs)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Report(ABC):
|
||||
""" Common properties of a coverage report """
|
||||
|
||||
llvm: LLVM
|
||||
demangler: str
|
||||
profdata: Path
|
||||
objects: List[str]
|
||||
sources: List[str]
|
||||
|
||||
def _common_kwargs(self):
|
||||
return dict(profdata=self.profdata,
|
||||
objects=self.objects,
|
||||
sources=self.sources,
|
||||
demangler=self.demangler)
|
||||
|
||||
@abstractmethod
|
||||
def generate(self):
|
||||
pass
|
||||
|
||||
def open(self):
|
||||
# Do nothing by default
|
||||
pass
|
||||
|
||||
|
||||
class SummaryReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_report(**self._common_kwargs())
|
||||
|
||||
|
||||
class TextReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_show(kind='text', **self._common_kwargs())
|
||||
|
||||
|
||||
class LcovReport(Report):
|
||||
def generate(self):
|
||||
self.llvm.cov_export(kind='lcov', **self._common_kwargs())
|
||||
|
||||
|
||||
@dataclass
|
||||
class HtmlReport(Report):
|
||||
output_dir: Path
|
||||
|
||||
def generate(self):
|
||||
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **self._common_kwargs())
|
||||
print(f'HTML report is located at `{self.output_dir}`')
|
||||
|
||||
def open(self):
|
||||
tool = dict(linux='xdg-open', darwin='open').get(sys.platform)
|
||||
if not tool:
|
||||
raise Exception(f'Unknown platform {sys.platform}')
|
||||
|
||||
subprocess.check_call([tool, self.output_dir / 'index.html'],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL)
|
||||
|
||||
|
||||
@dataclass
|
||||
class GithubPagesReport(HtmlReport):
|
||||
output_dir: Path
|
||||
commit_url: str
|
||||
|
||||
def generate(self):
|
||||
def index_path(path):
|
||||
return path / 'index.html'
|
||||
|
||||
common = self._common_kwargs()
|
||||
# Provide default sources if there's none
|
||||
common.setdefault('sources', ['.'])
|
||||
|
||||
self.llvm.cov_show(kind='html', output_dir=self.output_dir, **common)
|
||||
shutil.copy(index_path(self.output_dir), self.output_dir / 'local.html')
|
||||
|
||||
with TemporaryDirectory() as tmp:
|
||||
output_dir = Path(tmp)
|
||||
args = dict(common, sources=[])
|
||||
self.llvm.cov_show(kind='html', output_dir=output_dir, **args)
|
||||
shutil.copy(index_path(output_dir), self.output_dir / 'all.html')
|
||||
|
||||
with open(index_path(self.output_dir), 'w') as index:
|
||||
commit_sha = self.commit_url.rsplit('/', maxsplit=1)[-1][:10]
|
||||
|
||||
html = f"""
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Coverage ({commit_sha})</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>
|
||||
Coverage report for commit
|
||||
<a href="{self.commit_url}">
|
||||
{commit_sha}
|
||||
</a>
|
||||
</h1>
|
||||
|
||||
<p>
|
||||
<a href="./local.html">
|
||||
<b>Show only local sources</b>
|
||||
</a>
|
||||
</p>
|
||||
|
||||
<p>
|
||||
<a href="./all.html">
|
||||
Show all sources (including dependencies)
|
||||
</a>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
index.write(dedent(html))
|
||||
|
||||
print(f'HTML report is located at `{self.output_dir}`')
|
||||
|
||||
|
||||
class State:
|
||||
def __init__(self, cwd: Path, top_dir: Optional[Path], profraw_prefix: Optional[str]):
|
||||
# Use hostname by default
|
||||
profraw_prefix = profraw_prefix or '%h'
|
||||
|
||||
self.cwd = cwd
|
||||
self.cargo = Cargo(self.cwd)
|
||||
self.llvm = LLVM(self.cargo)
|
||||
|
||||
self.top_dir = top_dir or self.cargo.target_dir / 'coverage'
|
||||
self.report_dir = self.top_dir / 'report'
|
||||
|
||||
# Directory for raw coverage data emitted by executables
|
||||
self.profraw_dir = self.top_dir / 'profraw'
|
||||
self.profraw_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Aggregated coverage data
|
||||
self.profdata_file = self.top_dir / 'coverage.profdata'
|
||||
|
||||
# Dump all coverage data files into a dedicated directory.
|
||||
# Each filename is parameterized by PID & executable's signature.
|
||||
os.environ['LLVM_PROFILE_FILE'] = str(self.profraw_dir /
|
||||
f'cov-{profraw_prefix}-%p-%m.profraw')
|
||||
|
||||
os.environ['RUSTFLAGS'] = ' '.join([
|
||||
os.environ.get('RUSTFLAGS', ''),
|
||||
# Enable LLVM's source-based coverage
|
||||
# see: https://clang.llvm.org/docs/SourceBasedCodeCoverage.html
|
||||
# see: https://blog.rust-lang.org/inside-rust/2020/11/12/source-based-code-coverage.html
|
||||
'-Zinstrument-coverage',
|
||||
# Link every bit of code to prevent "holes" in coverage report
|
||||
# see: https://doc.rust-lang.org/rustc/codegen-options/index.html#link-dead-code
|
||||
'-Clink-dead-code',
|
||||
# Some of the paths that `rustc` embeds into binaries are absolute, others are relative.
|
||||
# The point is, we can't have both, because depending on `-path-equivalence`, `llvm-cov`
|
||||
# either will cripple absolute paths or won't be able to show relative paths at all.
|
||||
# There's no way to turn relative paths into absolute, so we strip $PWD prefix.
|
||||
# Only source files of deps (e.g. `$HOME/.cargo`) will keep their absolute paths,
|
||||
# but we won't include them in report by default (but see `--all`).
|
||||
f'--remap-path-prefix {self.cwd}=',
|
||||
])
|
||||
|
||||
# XXX: God, have mercy on our souls...
|
||||
# see: https://github.com/rust-lang/rust/pull/90132
|
||||
os.environ['RUSTC_BOOTSTRAP'] = '1'
|
||||
|
||||
def do_run(self, args):
|
||||
subprocess.check_call([*args.command, *args.args])
|
||||
|
||||
def do_report(self, args):
|
||||
if args.all and args.sources:
|
||||
raise Exception('--all should not be used with sources')
|
||||
|
||||
# see man for `llvm-cov show [sources]`
|
||||
if args.all:
|
||||
sources = []
|
||||
elif not args.sources:
|
||||
sources = ['.']
|
||||
else:
|
||||
sources = args.sources
|
||||
|
||||
print('* Merging profraw files')
|
||||
self.llvm.profdata(self.profraw_dir, self.profdata_file)
|
||||
|
||||
objects = []
|
||||
if args.input_objects:
|
||||
print('* Collecting object files using --input-objects')
|
||||
with open(args.input_objects) as f:
|
||||
objects.extend(f.read().splitlines(keepends=False))
|
||||
if args.cargo_objects == 'true' or (args.cargo_objects == 'auto'
|
||||
and not args.input_objects):
|
||||
print('* Collecting object files using cargo')
|
||||
objects.extend(self.cargo.binaries(args.profile))
|
||||
|
||||
params = dict(llvm=self.llvm,
|
||||
demangler=find_demangler(args.demangler),
|
||||
profdata=self.profdata_file,
|
||||
objects=objects,
|
||||
sources=sources)
|
||||
|
||||
formats = {
|
||||
'html':
|
||||
lambda: HtmlReport(**params, output_dir=self.report_dir),
|
||||
'text':
|
||||
lambda: TextReport(**params),
|
||||
'lcov':
|
||||
lambda: LcovReport(**params),
|
||||
'summary':
|
||||
lambda: SummaryReport(**params),
|
||||
'github':
|
||||
lambda: GithubPagesReport(
|
||||
**params, output_dir=self.report_dir, commit_url=args.commit_url),
|
||||
}
|
||||
|
||||
report = formats.get(args.format)()
|
||||
if not report:
|
||||
raise Exception('Format `{args.format}` is not supported')
|
||||
|
||||
print(f'* Rendering coverage report ({args.format})')
|
||||
report.generate()
|
||||
|
||||
if args.open:
|
||||
print('* Opening the report')
|
||||
report.open()
|
||||
|
||||
def do_clean(self, args):
|
||||
# Wipe everything if no filters have been provided
|
||||
if not (args.report or args.prof):
|
||||
shutil.rmtree(self.top_dir, ignore_errors=True)
|
||||
else:
|
||||
if args.report:
|
||||
shutil.rmtree(self.report_dir, ignore_errors=True)
|
||||
if args.prof:
|
||||
self.profdata_file.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def main():
|
||||
app = sys.argv[0]
|
||||
example = f"""
|
||||
prerequisites:
|
||||
# alternatively, install a system package for `llvm-tools`
|
||||
rustup component add llvm-tools-preview
|
||||
|
||||
self-contained example:
|
||||
{app} run make
|
||||
{app} run pipenv run pytest test_runner
|
||||
{app} run cargo test
|
||||
{app} report --open
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser(description='Coverage report builder',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog=example)
|
||||
parser.add_argument('--dir', type=Path, help='output directory')
|
||||
parser.add_argument('--profraw-prefix', metavar='STRING', type=str)
|
||||
|
||||
commands = parser.add_subparsers(title='commands', dest='subparser_name')
|
||||
|
||||
p_run = commands.add_parser('run', help='run a command with magic env')
|
||||
p_run.add_argument('command', nargs=1)
|
||||
p_run.add_argument('args', nargs=argparse.REMAINDER)
|
||||
|
||||
p_report = commands.add_parser('report', help='generate a coverage report')
|
||||
p_report.add_argument('--profile',
|
||||
default='debug',
|
||||
choices=('debug', 'release'),
|
||||
help='cargo build profile')
|
||||
p_report.add_argument('--format',
|
||||
default='html',
|
||||
choices=('html', 'text', 'summary', 'lcov', 'github'),
|
||||
help='report format')
|
||||
p_report.add_argument('--input-objects',
|
||||
metavar='FILE',
|
||||
type=Path,
|
||||
help='file containing list of binaries')
|
||||
p_report.add_argument('--cargo-objects',
|
||||
default='auto',
|
||||
choices=('auto', 'true', 'false'),
|
||||
help='use cargo for auto discovery of binaries')
|
||||
p_report.add_argument('--commit-url', type=str, help='required for --format=github')
|
||||
p_report.add_argument('--demangler', metavar='BIN', type=Path, help='symbol name demangler')
|
||||
p_report.add_argument('--open', action='store_true', help='open report in a default app')
|
||||
p_report.add_argument('--all', action='store_true', help='show everything, e.g. deps')
|
||||
p_report.add_argument('sources', nargs='*', type=Path, help='source file or directory')
|
||||
|
||||
p_clean = commands.add_parser('clean', help='wipe coverage artifacts')
|
||||
p_clean.add_argument('--report', action='store_true', help='pick generated report')
|
||||
p_clean.add_argument('--prof', action='store_true', help='pick *.profdata & *.profraw')
|
||||
|
||||
args = parser.parse_args()
|
||||
state = State(cwd=Path.cwd(), top_dir=args.dir, profraw_prefix=args.profraw_prefix)
|
||||
|
||||
commands = {
|
||||
'run': state.do_run,
|
||||
'report': state.do_report,
|
||||
'clean': state.do_clean,
|
||||
}
|
||||
|
||||
action = commands.get(args.subparser_name)
|
||||
if action:
|
||||
action(args)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# this is a shortcut script to avoid duplication in CI
|
||||
|
||||
set -eux -o pipefail
|
||||
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
git clone https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git
|
||||
cd zenith-perf-data
|
||||
mkdir -p reports/
|
||||
mkdir -p data/$REPORT_TO
|
||||
|
||||
cp $REPORT_FROM/* data/$REPORT_TO
|
||||
|
||||
echo "Generating report"
|
||||
pipenv run python $SCRIPT_DIR/generate_perf_report_page.py --input-dir data/$REPORT_TO --out reports/$REPORT_TO.html
|
||||
echo "Uploading perf result"
|
||||
git add data reports
|
||||
git \
|
||||
-c "user.name=vipvap" \
|
||||
-c "user.email=vipvap@zenith.tech" \
|
||||
commit \
|
||||
--author="vipvap <vipvap@zenith.tech>" \
|
||||
-m "add performance test result for $GITHUB_SHA zenith revision"
|
||||
|
||||
git push https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-perf-data.git master
|
||||
@@ -1,207 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
import json
|
||||
from typing import Any, Dict, List, Optional, Tuple, cast
|
||||
from jinja2 import Template
|
||||
|
||||
# skip 'input' columns. They are included in the header and just blow the table
|
||||
EXCLUDE_COLUMNS = frozenset({
|
||||
'scale',
|
||||
'duration',
|
||||
'number_of_clients',
|
||||
'number_of_threads',
|
||||
'init_start_timestamp',
|
||||
'init_end_timestamp',
|
||||
'run_start_timestamp',
|
||||
'run_end_timestamp',
|
||||
})
|
||||
|
||||
KEY_EXCLUDE_FIELDS = frozenset({
|
||||
'init_start_timestamp',
|
||||
'init_end_timestamp',
|
||||
'run_start_timestamp',
|
||||
'run_end_timestamp',
|
||||
})
|
||||
NEGATIVE_COLOR = 'negative'
|
||||
POSITIVE_COLOR = 'positive'
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuitRun:
|
||||
revision: str
|
||||
values: Dict[str, Any]
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuitRuns:
|
||||
platform: str
|
||||
suit: str
|
||||
common_columns: List[Tuple[str, str]]
|
||||
value_columns: List[str]
|
||||
runs: List[SuitRun]
|
||||
|
||||
|
||||
@dataclass
|
||||
class RowValue:
|
||||
value: str
|
||||
color: str
|
||||
ratio: str
|
||||
|
||||
|
||||
def get_columns(values: List[Dict[Any, Any]]) -> Tuple[List[Tuple[str, str]], List[str]]:
|
||||
value_columns = []
|
||||
common_columns = []
|
||||
for item in values:
|
||||
if item['name'] in KEY_EXCLUDE_FIELDS:
|
||||
continue
|
||||
if item['report'] != 'test_param':
|
||||
value_columns.append(cast(str, item['name']))
|
||||
else:
|
||||
common_columns.append((cast(str, item['name']), cast(str, item['value'])))
|
||||
value_columns.sort()
|
||||
common_columns.sort(key=lambda x: x[0]) # sort by name
|
||||
return common_columns, value_columns
|
||||
|
||||
|
||||
def format_ratio(ratio: float, report: str) -> Tuple[str, str]:
|
||||
color = ''
|
||||
sign = '+' if ratio > 0 else ''
|
||||
if abs(ratio) < 0.05:
|
||||
return f' ({sign}{ratio:.2f})', color
|
||||
|
||||
if report not in {'test_param', 'higher_is_better', 'lower_is_better'}:
|
||||
raise ValueError(f'Unknown report type: {report}')
|
||||
|
||||
if report == 'test_param':
|
||||
return f'{ratio:.2f}', color
|
||||
|
||||
if ratio > 0:
|
||||
if report == 'higher_is_better':
|
||||
color = POSITIVE_COLOR
|
||||
elif report == 'lower_is_better':
|
||||
color = NEGATIVE_COLOR
|
||||
elif ratio < 0:
|
||||
if report == 'higher_is_better':
|
||||
color = NEGATIVE_COLOR
|
||||
elif report == 'lower_is_better':
|
||||
color = POSITIVE_COLOR
|
||||
|
||||
return f' ({sign}{ratio:.2f})', color
|
||||
|
||||
|
||||
def extract_value(name: str, suit_run: SuitRun) -> Optional[Dict[str, Any]]:
|
||||
for item in suit_run.values['data']:
|
||||
if item['name'] == name:
|
||||
return cast(Dict[str, Any], item)
|
||||
return None
|
||||
|
||||
|
||||
def get_row_values(columns: List[str], run_result: SuitRun,
|
||||
prev_result: Optional[SuitRun]) -> List[RowValue]:
|
||||
row_values = []
|
||||
for column in columns:
|
||||
current_value = extract_value(column, run_result)
|
||||
if current_value is None:
|
||||
# should never happen
|
||||
raise ValueError(f'{column} not found in {run_result.values}')
|
||||
|
||||
value = current_value["value"]
|
||||
if isinstance(value, float):
|
||||
value = f'{value:.2f}'
|
||||
|
||||
if prev_result is None:
|
||||
row_values.append(RowValue(value, '', ''))
|
||||
continue
|
||||
|
||||
prev_value = extract_value(column, prev_result)
|
||||
if prev_value is None:
|
||||
# this might happen when new metric is added and there is no value for it in previous run
|
||||
# let this be here, TODO add proper handling when this actually happens
|
||||
raise ValueError(f'{column} not found in previous result')
|
||||
ratio = float(value) / float(prev_value['value']) - 1
|
||||
ratio_display, color = format_ratio(ratio, current_value['report'])
|
||||
row_values.append(RowValue(value, color, ratio_display))
|
||||
return row_values
|
||||
|
||||
|
||||
@dataclass
|
||||
class SuiteRunTableRow:
|
||||
revision: str
|
||||
values: List[RowValue]
|
||||
|
||||
|
||||
def prepare_rows_from_runs(value_columns: List[str], runs: List[SuitRun]) -> List[SuiteRunTableRow]:
|
||||
rows = []
|
||||
prev_run = None
|
||||
for run in runs:
|
||||
rows.append(
|
||||
SuiteRunTableRow(revision=run.revision,
|
||||
values=get_row_values(value_columns, run, prev_run)))
|
||||
prev_run = run
|
||||
|
||||
return rows
|
||||
|
||||
|
||||
def main(args: argparse.Namespace) -> None:
|
||||
input_dir = Path(args.input_dir)
|
||||
grouped_runs: Dict[str, SuitRuns] = {}
|
||||
# we have files in form: <ctr>_<rev>.json
|
||||
# fill them in the hashmap so we have grouped items for the
|
||||
# same run configuration (scale, duration etc.) ordered by counter.
|
||||
for item in sorted(input_dir.iterdir(), key=lambda x: int(x.name.split('_')[0])):
|
||||
run_data = json.loads(item.read_text())
|
||||
revision = run_data['revision']
|
||||
|
||||
for suit_result in run_data['result']:
|
||||
key = "{}{}".format(run_data['platform'], suit_result['suit'])
|
||||
# pack total duration as a synthetic value
|
||||
total_duration = suit_result['total_duration']
|
||||
suit_result['data'].append({
|
||||
'name': 'total_duration',
|
||||
'value': total_duration,
|
||||
'unit': 's',
|
||||
'report': 'lower_is_better',
|
||||
})
|
||||
common_columns, value_columns = get_columns(suit_result['data'])
|
||||
|
||||
grouped_runs.setdefault(
|
||||
key,
|
||||
SuitRuns(
|
||||
platform=run_data['platform'],
|
||||
suit=suit_result['suit'],
|
||||
common_columns=common_columns,
|
||||
value_columns=value_columns,
|
||||
runs=[],
|
||||
),
|
||||
)
|
||||
|
||||
grouped_runs[key].runs.append(SuitRun(revision=revision, values=suit_result))
|
||||
context = {}
|
||||
for result in grouped_runs.values():
|
||||
suit = result.suit
|
||||
context[suit] = {
|
||||
'common_columns': result.common_columns,
|
||||
'value_columns': result.value_columns,
|
||||
'platform': result.platform,
|
||||
# reverse the order so newest results are on top of the table
|
||||
'rows': reversed(prepare_rows_from_runs(result.value_columns, result.runs)),
|
||||
}
|
||||
|
||||
template = Template((Path(__file__).parent / 'perf_report_template.html').read_text())
|
||||
|
||||
Path(args.out).write_text(template.render(context=context))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
'--input-dir',
|
||||
dest='input_dir',
|
||||
required=True,
|
||||
help='Directory with jsons generated by the test suite',
|
||||
)
|
||||
parser.add_argument('--out', required=True, help='Output html file path')
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@@ -1,136 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
from contextlib import contextmanager
|
||||
from tempfile import TemporaryDirectory
|
||||
from pathlib import Path
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
|
||||
def absolute_path(path):
|
||||
return Path(path).resolve()
|
||||
|
||||
|
||||
def relative_path(path):
|
||||
path = Path(path)
|
||||
if path.is_absolute():
|
||||
raise Exception(f'path `{path}` must be relative!')
|
||||
return path
|
||||
|
||||
|
||||
@contextmanager
|
||||
def chdir(cwd: Path):
|
||||
old = os.getcwd()
|
||||
os.chdir(cwd)
|
||||
try:
|
||||
yield cwd
|
||||
finally:
|
||||
os.chdir(old)
|
||||
|
||||
|
||||
def run(cmd, *args, **kwargs):
|
||||
print('$', ' '.join(cmd))
|
||||
subprocess.check_call(cmd, *args, **kwargs)
|
||||
|
||||
|
||||
class GitRepo:
|
||||
def __init__(self, url):
|
||||
self.url = url
|
||||
self.cwd = TemporaryDirectory()
|
||||
|
||||
subprocess.check_call([
|
||||
'git',
|
||||
'clone',
|
||||
str(url),
|
||||
self.cwd.name,
|
||||
])
|
||||
|
||||
def is_dirty(self):
|
||||
res = subprocess.check_output(['git', 'status', '--porcelain'], text=True).strip()
|
||||
return bool(res)
|
||||
|
||||
def update(self, message, action, branch=None):
|
||||
with chdir(self.cwd.name):
|
||||
if not branch:
|
||||
cmd = ['git', 'branch', '--show-current']
|
||||
branch = subprocess.check_output(cmd, text=True).strip()
|
||||
|
||||
# Run action in repo's directory
|
||||
action()
|
||||
|
||||
run(['git', 'add', '.'])
|
||||
|
||||
if not self.is_dirty():
|
||||
print('No changes detected, quitting')
|
||||
return
|
||||
|
||||
run([
|
||||
'git',
|
||||
'-c',
|
||||
'user.name=vipvap',
|
||||
'-c',
|
||||
'user.email=vipvap@zenith.tech',
|
||||
'commit',
|
||||
'--author="vipvap <vipvap@zenith.tech>"',
|
||||
f'--message={message}',
|
||||
])
|
||||
|
||||
for _ in range(5):
|
||||
try:
|
||||
run(['git', 'fetch', 'origin', branch])
|
||||
run(['git', 'rebase', f'origin/{branch}'])
|
||||
run(['git', 'push', 'origin', branch])
|
||||
return
|
||||
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f'failed to update branch `{branch}`: {e}', file=sys.stderr)
|
||||
|
||||
raise Exception(f'failed to update branch `{branch}`')
|
||||
|
||||
|
||||
def do_copy(args):
|
||||
src = args.src
|
||||
dst = args.dst
|
||||
|
||||
try:
|
||||
if src.is_dir():
|
||||
shutil.copytree(src, dst)
|
||||
else:
|
||||
shutil.copy(src, dst)
|
||||
except FileExistsError:
|
||||
if args.forbid_overwrite:
|
||||
raise
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Git upload tool')
|
||||
parser.add_argument('--repo', type=str, metavar='URL', required=True, help='git repo url')
|
||||
parser.add_argument('--message', type=str, metavar='TEXT', help='commit message')
|
||||
|
||||
commands = parser.add_subparsers(title='commands', dest='subparser_name')
|
||||
|
||||
p_copy = commands.add_parser('copy', help='copy file into the repo')
|
||||
p_copy.add_argument('src', type=absolute_path, help='source path')
|
||||
p_copy.add_argument('dst', type=relative_path, help='relative dest path')
|
||||
p_copy.add_argument('--forbid-overwrite', action='store_true', help='do not allow overwrites')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
commands = {
|
||||
'copy': do_copy,
|
||||
}
|
||||
|
||||
action = commands.get(args.subparser_name)
|
||||
if action:
|
||||
message = args.message or 'update'
|
||||
GitRepo(args.repo).update(message, lambda: action(args))
|
||||
else:
|
||||
parser.print_usage()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -1,52 +0,0 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
|
||||
<body>
|
||||
<style>
|
||||
table,
|
||||
th,
|
||||
td {
|
||||
border: 1px solid black;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
|
||||
.positive {
|
||||
background-color: rgba(0, 255, 0, 0.8)
|
||||
}
|
||||
|
||||
.negative {
|
||||
background-color: rgba(255, 0, 0, 0.65)
|
||||
}
|
||||
</style>
|
||||
|
||||
<h2>Zenith Performance Tests</h2>
|
||||
|
||||
{% for suit_name, suit_data in context.items() %}
|
||||
<h3>Runs for {{ suit_name }} </h3>
|
||||
<b>platform:</b> {{ suit_data.platform }}<br>
|
||||
{% for common_column_name, common_column_value in suit_data.common_columns %}
|
||||
<b>{{ common_column_name }}</b>: {{ common_column_value }}<br>
|
||||
{% endfor %}
|
||||
<br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>revision</th>
|
||||
{% for column_name in suit_data.value_columns %}
|
||||
<th>{{ column_name }}</th>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% for row in suit_data.rows %}
|
||||
<tr>
|
||||
<td><a href=https://github.com/zenithdb/zenith/commit/{{ row.revision }}>{{ row.revision[:6] }}</a></td>
|
||||
{% for column_value in row.values %}
|
||||
<td class="{{ column_value.color }}">{{ column_value.value }}{{column_value.ratio}}</td>
|
||||
{% endfor %}
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
{% endfor %}
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
29
test_runner/Pipfile
Normal file
29
test_runner/Pipfile
Normal file
@@ -0,0 +1,29 @@
|
||||
[[source]]
|
||||
url = "https://pypi.python.org/simple"
|
||||
verify_ssl = true
|
||||
name = "pypi"
|
||||
|
||||
[packages]
|
||||
pytest = ">=6.0.0"
|
||||
psycopg2 = "*"
|
||||
typing-extensions = "*"
|
||||
pyjwt = {extras = ["crypto"], version = "*"}
|
||||
requests = "*"
|
||||
pytest-xdist = "*"
|
||||
asyncpg = "*"
|
||||
cached-property = "*"
|
||||
|
||||
[dev-packages]
|
||||
# Behavior may change slightly between versions. These are run continuously,
|
||||
# so we pin exact versions to avoid suprising breaks. Update if comfortable.
|
||||
yapf = "==0.31.0"
|
||||
mypy = "==0.910"
|
||||
# Non-pinned packages follow.
|
||||
pipenv = "*"
|
||||
flake8 = "*"
|
||||
types-requests = "*"
|
||||
types-psycopg2 = "*"
|
||||
|
||||
[requires]
|
||||
# we need at least 3.7, but pipenv doesn't allow to say this directly
|
||||
python_version = "3"
|
||||
544
test_runner/Pipfile.lock
generated
Normal file
544
test_runner/Pipfile.lock
generated
Normal file
@@ -0,0 +1,544 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "63b72760ef37375186a638066ba0ad5804dbace99ddc503ea654e9749070ab24"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
"python_version": "3"
|
||||
},
|
||||
"sources": [
|
||||
{
|
||||
"name": "pypi",
|
||||
"url": "https://pypi.python.org/simple",
|
||||
"verify_ssl": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"default": {
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:129d501f3d30616afd51eb8d3142ef51ba05374256bd5834cec3ef4956a9b317",
|
||||
"sha256:29ef6ae0a617fc13cc2ac5dc8e9b367bb83cba220614b437af9b67766f4b6b20",
|
||||
"sha256:41704c561d354bef01353835a7846e5606faabbeb846214dfcf666cf53319f18",
|
||||
"sha256:556b0e92e2b75dc028b3c4bc9bd5162ddf0053b856437cf1f04c97f9c6837d03",
|
||||
"sha256:8ff5073d4b654e34bd5eaadc01dc4d68b8a9609084d835acd364cd934190a08d",
|
||||
"sha256:a458fc69051fbb67d995fdda46d75a012b5d6200f91e17d23d4751482640ed4c",
|
||||
"sha256:a7095890c96ba36f9f668eb552bb020dddb44f8e73e932f8573efc613ee83843",
|
||||
"sha256:a738f4807c853623d3f93f0fea11f61be6b0e5ca16ea8aeb42c2c7ee742aa853",
|
||||
"sha256:c4fc0205fe4ddd5aeb3dfdc0f7bafd43411181e1f5650189608e5971cceacff1",
|
||||
"sha256:dd2fa063c3344823487d9ddccb40802f02622ddf8bf8a6cc53885ee7a2c1c0c6",
|
||||
"sha256:ddffcb85227bf39cd1bedd4603e0082b243cf3b14ced64dce506a15b05232b83",
|
||||
"sha256:e36c6806883786b19551bb70a4882561f31135dc8105a59662e0376cf5b2cbc5",
|
||||
"sha256:eed43abc6ccf1dc02e0d0efc06ce46a411362f3358847c6b0ec9a43426f91ece"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.24.0"
|
||||
},
|
||||
"attrs": {
|
||||
"hashes": [
|
||||
"sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1",
|
||||
"sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"cached-property": {
|
||||
"hashes": [
|
||||
"sha256:9fa5755838eecbb2d234c3aa390bd80fbd3ac6b6869109bfc1b499f7bd89a130",
|
||||
"sha256:df4f613cf7ad9a588cc381aaf4a512d26265ecebd5eb9e1ba12f1319eb85a6a0"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==1.5.2"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"cffi": {
|
||||
"hashes": [
|
||||
"sha256:00c878c90cb53ccfaae6b8bc18ad05d2036553e6d9d1d9dbcf323bbe83854ca3",
|
||||
"sha256:0104fb5ae2391d46a4cb082abdd5c69ea4eab79d8d44eaaf79f1b1fd806ee4c2",
|
||||
"sha256:06c48159c1abed75c2e721b1715c379fa3200c7784271b3c46df01383b593636",
|
||||
"sha256:0808014eb713677ec1292301ea4c81ad277b6cdf2fdd90fd540af98c0b101d20",
|
||||
"sha256:10dffb601ccfb65262a27233ac273d552ddc4d8ae1bf93b21c94b8511bffe728",
|
||||
"sha256:14cd121ea63ecdae71efa69c15c5543a4b5fbcd0bbe2aad864baca0063cecf27",
|
||||
"sha256:17771976e82e9f94976180f76468546834d22a7cc404b17c22df2a2c81db0c66",
|
||||
"sha256:181dee03b1170ff1969489acf1c26533710231c58f95534e3edac87fff06c443",
|
||||
"sha256:23cfe892bd5dd8941608f93348c0737e369e51c100d03718f108bf1add7bd6d0",
|
||||
"sha256:263cc3d821c4ab2213cbe8cd8b355a7f72a8324577dc865ef98487c1aeee2bc7",
|
||||
"sha256:2756c88cbb94231c7a147402476be2c4df2f6078099a6f4a480d239a8817ae39",
|
||||
"sha256:27c219baf94952ae9d50ec19651a687b826792055353d07648a5695413e0c605",
|
||||
"sha256:2a23af14f408d53d5e6cd4e3d9a24ff9e05906ad574822a10563efcef137979a",
|
||||
"sha256:31fb708d9d7c3f49a60f04cf5b119aeefe5644daba1cd2a0fe389b674fd1de37",
|
||||
"sha256:3415c89f9204ee60cd09b235810be700e993e343a408693e80ce7f6a40108029",
|
||||
"sha256:3773c4d81e6e818df2efbc7dd77325ca0dcb688116050fb2b3011218eda36139",
|
||||
"sha256:3b96a311ac60a3f6be21d2572e46ce67f09abcf4d09344c49274eb9e0bf345fc",
|
||||
"sha256:3f7d084648d77af029acb79a0ff49a0ad7e9d09057a9bf46596dac9514dc07df",
|
||||
"sha256:41d45de54cd277a7878919867c0f08b0cf817605e4eb94093e7516505d3c8d14",
|
||||
"sha256:4238e6dab5d6a8ba812de994bbb0a79bddbdf80994e4ce802b6f6f3142fcc880",
|
||||
"sha256:45db3a33139e9c8f7c09234b5784a5e33d31fd6907800b316decad50af323ff2",
|
||||
"sha256:45e8636704eacc432a206ac7345a5d3d2c62d95a507ec70d62f23cd91770482a",
|
||||
"sha256:4958391dbd6249d7ad855b9ca88fae690783a6be9e86df65865058ed81fc860e",
|
||||
"sha256:4a306fa632e8f0928956a41fa8e1d6243c71e7eb59ffbd165fc0b41e316b2474",
|
||||
"sha256:57e9ac9ccc3101fac9d6014fba037473e4358ef4e89f8e181f8951a2c0162024",
|
||||
"sha256:59888172256cac5629e60e72e86598027aca6bf01fa2465bdb676d37636573e8",
|
||||
"sha256:5e069f72d497312b24fcc02073d70cb989045d1c91cbd53979366077959933e0",
|
||||
"sha256:64d4ec9f448dfe041705426000cc13e34e6e5bb13736e9fd62e34a0b0c41566e",
|
||||
"sha256:6dc2737a3674b3e344847c8686cf29e500584ccad76204efea14f451d4cc669a",
|
||||
"sha256:74fdfdbfdc48d3f47148976f49fab3251e550a8720bebc99bf1483f5bfb5db3e",
|
||||
"sha256:75e4024375654472cc27e91cbe9eaa08567f7fbdf822638be2814ce059f58032",
|
||||
"sha256:786902fb9ba7433aae840e0ed609f45c7bcd4e225ebb9c753aa39725bb3e6ad6",
|
||||
"sha256:8b6c2ea03845c9f501ed1313e78de148cd3f6cad741a75d43a29b43da27f2e1e",
|
||||
"sha256:91d77d2a782be4274da750752bb1650a97bfd8f291022b379bb8e01c66b4e96b",
|
||||
"sha256:91ec59c33514b7c7559a6acda53bbfe1b283949c34fe7440bcf917f96ac0723e",
|
||||
"sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954",
|
||||
"sha256:a5263e363c27b653a90078143adb3d076c1a748ec9ecc78ea2fb916f9b861962",
|
||||
"sha256:abb9a20a72ac4e0fdb50dae135ba5e77880518e742077ced47eb1499e29a443c",
|
||||
"sha256:c2051981a968d7de9dd2d7b87bcb9c939c74a34626a6e2f8181455dd49ed69e4",
|
||||
"sha256:c21c9e3896c23007803a875460fb786118f0cdd4434359577ea25eb556e34c55",
|
||||
"sha256:c2502a1a03b6312837279c8c1bd3ebedf6c12c4228ddbad40912d671ccc8a962",
|
||||
"sha256:d4d692a89c5cf08a8557fdeb329b82e7bf609aadfaed6c0d79f5a449a3c7c023",
|
||||
"sha256:da5db4e883f1ce37f55c667e5c0de439df76ac4cb55964655906306918e7363c",
|
||||
"sha256:e7022a66d9b55e93e1a845d8c9eba2a1bebd4966cd8bfc25d9cd07d515b33fa6",
|
||||
"sha256:ef1f279350da2c586a69d32fc8733092fd32cc8ac95139a00377841f59a3f8d8",
|
||||
"sha256:f54a64f8b0c8ff0b64d18aa76675262e1700f3995182267998c31ae974fbc382",
|
||||
"sha256:f5c7150ad32ba43a07c4479f40241756145a1f03b43480e058cfd862bf5041c7",
|
||||
"sha256:f6f824dc3bce0edab5f427efcfb1d63ee75b6fcb7282900ccaf925be84efb0fc",
|
||||
"sha256:fd8a250edc26254fe5b33be00402e6d287f562b6a5b2152dec302fa15bb3e997",
|
||||
"sha256:ffaa5c925128e29efbde7301d8ecaf35c8c60ffbcd6a1ffd3a552177c8e5e796"
|
||||
],
|
||||
"version": "==1.15.0"
|
||||
},
|
||||
"charset-normalizer": {
|
||||
"hashes": [
|
||||
"sha256:e019de665e2bcf9c2b64e2e5aa025fa991da8720daa3c1138cadd2fd1856aed0",
|
||||
"sha256:f7af805c321bfa1ce6714c51f254e0d5bb5e5834039bc17db7ebe3a4cec9492b"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==2.0.7"
|
||||
},
|
||||
"cryptography": {
|
||||
"hashes": [
|
||||
"sha256:07bb7fbfb5de0980590ddfc7f13081520def06dc9ed214000ad4372fb4e3c7f6",
|
||||
"sha256:18d90f4711bf63e2fb21e8c8e51ed8189438e6b35a6d996201ebd98a26abbbe6",
|
||||
"sha256:1ed82abf16df40a60942a8c211251ae72858b25b7421ce2497c2eb7a1cee817c",
|
||||
"sha256:22a38e96118a4ce3b97509443feace1d1011d0571fae81fc3ad35f25ba3ea999",
|
||||
"sha256:2d69645f535f4b2c722cfb07a8eab916265545b3475fdb34e0be2f4ee8b0b15e",
|
||||
"sha256:4a2d0e0acc20ede0f06ef7aa58546eee96d2592c00f450c9acb89c5879b61992",
|
||||
"sha256:54b2605e5475944e2213258e0ab8696f4f357a31371e538ef21e8d61c843c28d",
|
||||
"sha256:7075b304cd567694dc692ffc9747f3e9cb393cc4aa4fb7b9f3abd6f5c4e43588",
|
||||
"sha256:7b7ceeff114c31f285528ba8b390d3e9cfa2da17b56f11d366769a807f17cbaa",
|
||||
"sha256:7eba2cebca600a7806b893cb1d541a6e910afa87e97acf2021a22b32da1df52d",
|
||||
"sha256:928185a6d1ccdb816e883f56ebe92e975a262d31cc536429041921f8cb5a62fd",
|
||||
"sha256:9933f28f70d0517686bd7de36166dda42094eac49415459d9bdf5e7df3e0086d",
|
||||
"sha256:a688ebcd08250eab5bb5bca318cc05a8c66de5e4171a65ca51db6bd753ff8953",
|
||||
"sha256:abb5a361d2585bb95012a19ed9b2c8f412c5d723a9836418fab7aaa0243e67d2",
|
||||
"sha256:c10c797ac89c746e488d2ee92bd4abd593615694ee17b2500578b63cad6b93a8",
|
||||
"sha256:ced40344e811d6abba00295ced98c01aecf0c2de39481792d87af4fa58b7b4d6",
|
||||
"sha256:d57e0cdc1b44b6cdf8af1d01807db06886f10177469312fbde8f44ccbb284bc9",
|
||||
"sha256:d99915d6ab265c22873f1b4d6ea5ef462ef797b4140be4c9d8b179915e0985c6",
|
||||
"sha256:eb80e8a1f91e4b7ef8b33041591e6d89b2b8e122d787e87eeb2b08da71bb16ad",
|
||||
"sha256:ebeddd119f526bcf323a89f853afb12e225902a24d29b55fe18dd6fcb2838a76"
|
||||
],
|
||||
"version": "==35.0.0"
|
||||
},
|
||||
"execnet": {
|
||||
"hashes": [
|
||||
"sha256:8f694f3ba9cc92cab508b152dcfe322153975c29bda272e2fd7f3f00f36e47c5",
|
||||
"sha256:a295f7cc774947aac58dde7fdc85f4aa00c42adf5d8f5468fc630c1acf30a142"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.9.0"
|
||||
},
|
||||
"idna": {
|
||||
"hashes": [
|
||||
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff",
|
||||
"sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"
|
||||
],
|
||||
"markers": "python_version >= '3'",
|
||||
"version": "==3.3"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"iniconfig": {
|
||||
"hashes": [
|
||||
"sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3",
|
||||
"sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"
|
||||
],
|
||||
"version": "==1.1.1"
|
||||
},
|
||||
"packaging": {
|
||||
"hashes": [
|
||||
"sha256:7dc96269f53a4ccec5c0670940a4281106dd0bb343f47b7471f779df49c2fbe7",
|
||||
"sha256:c86254f9220d55e31cc94d69bade760f0847da8000def4dfe1c6b872fd14ff14"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.0"
|
||||
},
|
||||
"pluggy": {
|
||||
"hashes": [
|
||||
"sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159",
|
||||
"sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==1.0.0"
|
||||
},
|
||||
"psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:079d97fc22de90da1d370c90583659a9f9a6ee4007355f5825e5f1c70dffc1fa",
|
||||
"sha256:2087013c159a73e09713294a44d0c8008204d06326006b7f652bef5ace66eebb",
|
||||
"sha256:2c992196719fadda59f72d44603ee1a2fdcc67de097eea38d41c7ad9ad246e62",
|
||||
"sha256:7640e1e4d72444ef012e275e7b53204d7fab341fb22bc76057ede22fe6860b25",
|
||||
"sha256:7f91312f065df517187134cce8e395ab37f5b601a42446bdc0f0d51773621854",
|
||||
"sha256:830c8e8dddab6b6716a4bf73a09910c7954a92f40cf1d1e702fb93c8a919cc56",
|
||||
"sha256:89409d369f4882c47f7ea20c42c5046879ce22c1e4ea20ef3b00a4dfc0a7f188",
|
||||
"sha256:bf35a25f1aaa8a3781195595577fcbb59934856ee46b4f252f56ad12b8043bcf",
|
||||
"sha256:de5303a6f1d0a7a34b9d40e4d3bef684ccc44a49bbe3eb85e3c0bffb4a131b7c"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"py": {
|
||||
"hashes": [
|
||||
"sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3",
|
||||
"sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.10.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:2d475327684562c3a96cc71adf7dc8c4f0565175cf86b6d7a404ff4c771f15f0",
|
||||
"sha256:7582ad22678f0fcd81102833f60ef8d0e57288b6b5fb00323d101be910e35705"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.20"
|
||||
},
|
||||
"pyjwt": {
|
||||
"extras": [
|
||||
"crypto"
|
||||
],
|
||||
"hashes": [
|
||||
"sha256:b888b4d56f06f6dcd777210c334e69c737be74755d3e5e9ee3fe67dc18a0ee41",
|
||||
"sha256:e0c4bb8d9f0af0c7f5b1ec4c5036309617d03d56932877f2f7a0beeb5318322f"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.3.0"
|
||||
},
|
||||
"pyparsing": {
|
||||
"hashes": [
|
||||
"sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1",
|
||||
"sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.7"
|
||||
},
|
||||
"pytest": {
|
||||
"hashes": [
|
||||
"sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89",
|
||||
"sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==6.2.5"
|
||||
},
|
||||
"pytest-forked": {
|
||||
"hashes": [
|
||||
"sha256:6aa9ac7e00ad1a539c41bec6d21011332de671e938c7637378ec9710204e37ca",
|
||||
"sha256:dc4147784048e70ef5d437951728825a131b81714b398d5d52f17c7c144d8815"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==1.3.0"
|
||||
},
|
||||
"pytest-xdist": {
|
||||
"hashes": [
|
||||
"sha256:7b61ebb46997a0820a263553179d6d1e25a8c50d8a8620cd1aa1e20e3be99168",
|
||||
"sha256:89b330316f7fc475f999c81b577c2b926c9569f3d397ae432c0c2e2496d61ff9"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"requests": {
|
||||
"hashes": [
|
||||
"sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24",
|
||||
"sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.26.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"urllib3": {
|
||||
"hashes": [
|
||||
"sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece",
|
||||
"sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'",
|
||||
"version": "==1.26.7"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
},
|
||||
"develop": {
|
||||
"backports.entry-points-selectable": {
|
||||
"hashes": [
|
||||
"sha256:988468260ec1c196dab6ae1149260e2f5472c9110334e5d51adcb77867361f6a",
|
||||
"sha256:a6d9a871cde5e15b4c4a53e3d43ba890cc6861ec1332c9c2428c92f977192acc"
|
||||
],
|
||||
"markers": "python_version >= '2.7'",
|
||||
"version": "==1.1.0"
|
||||
},
|
||||
"certifi": {
|
||||
"hashes": [
|
||||
"sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872",
|
||||
"sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569"
|
||||
],
|
||||
"version": "==2021.10.8"
|
||||
},
|
||||
"distlib": {
|
||||
"hashes": [
|
||||
"sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31",
|
||||
"sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05"
|
||||
],
|
||||
"version": "==0.3.3"
|
||||
},
|
||||
"filelock": {
|
||||
"hashes": [
|
||||
"sha256:2b5eb3589e7fdda14599e7eb1a50e09b4cc14f34ed98b8ba56d33bfaafcbef2f",
|
||||
"sha256:34a9f35f95c441e7b38209775d6e0337f9a3759f3565f6c5798f19618527c76f"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.3.1"
|
||||
},
|
||||
"flake8": {
|
||||
"hashes": [
|
||||
"sha256:479b1304f72536a55948cb40a32dce8bb0ffe3501e26eaf292c7e60eb5e0428d",
|
||||
"sha256:806e034dda44114815e23c16ef92f95c91e4c71100ff52813adf7132a6ad870d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==4.0.1"
|
||||
},
|
||||
"importlib-metadata": {
|
||||
"hashes": [
|
||||
"sha256:b618b6d2d5ffa2f16add5697cf57a46c76a56229b0ed1c438322e4e95645bd15",
|
||||
"sha256:f284b3e11256ad1e5d03ab86bb2ccd6f5339688ff17a4d797a0fe7df326f23b1"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==4.8.1"
|
||||
},
|
||||
"mccabe": {
|
||||
"hashes": [
|
||||
"sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
|
||||
"sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
|
||||
],
|
||||
"version": "==0.6.1"
|
||||
},
|
||||
"mypy": {
|
||||
"hashes": [
|
||||
"sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9",
|
||||
"sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a",
|
||||
"sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9",
|
||||
"sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e",
|
||||
"sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2",
|
||||
"sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212",
|
||||
"sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b",
|
||||
"sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885",
|
||||
"sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150",
|
||||
"sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703",
|
||||
"sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072",
|
||||
"sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457",
|
||||
"sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e",
|
||||
"sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0",
|
||||
"sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb",
|
||||
"sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97",
|
||||
"sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8",
|
||||
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811",
|
||||
"sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6",
|
||||
"sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de",
|
||||
"sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504",
|
||||
"sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921",
|
||||
"sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.910"
|
||||
},
|
||||
"mypy-extensions": {
|
||||
"hashes": [
|
||||
"sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d",
|
||||
"sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8"
|
||||
],
|
||||
"version": "==0.4.3"
|
||||
},
|
||||
"pipenv": {
|
||||
"hashes": [
|
||||
"sha256:05958fadcd70b2de6a27542fcd2bd72dd5c59c6d35307fdac3e06361fb06e30e",
|
||||
"sha256:d180f5be4775c552fd5e69ae18a9d6099d9dafb462efe54f11c72cb5f4d5e977"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2021.5.29"
|
||||
},
|
||||
"platformdirs": {
|
||||
"hashes": [
|
||||
"sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2",
|
||||
"sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"pycodestyle": {
|
||||
"hashes": [
|
||||
"sha256:720f8b39dde8b293825e7ff02c475f3077124006db4f440dcbc9a20b76548a20",
|
||||
"sha256:eddd5847ef438ea1c7870ca7eb78a9d47ce0cdb4851a5523949f2601d0cbbe7f"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==2.8.0"
|
||||
},
|
||||
"pyflakes": {
|
||||
"hashes": [
|
||||
"sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c",
|
||||
"sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"six": {
|
||||
"hashes": [
|
||||
"sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
|
||||
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.16.0"
|
||||
},
|
||||
"toml": {
|
||||
"hashes": [
|
||||
"sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b",
|
||||
"sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"
|
||||
],
|
||||
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.10.2"
|
||||
},
|
||||
"typed-ast": {
|
||||
"hashes": [
|
||||
"sha256:01ae5f73431d21eead5015997ab41afa53aa1fbe252f9da060be5dad2c730ace",
|
||||
"sha256:067a74454df670dcaa4e59349a2e5c81e567d8d65458d480a5b3dfecec08c5ff",
|
||||
"sha256:0fb71b8c643187d7492c1f8352f2c15b4c4af3f6338f21681d3681b3dc31a266",
|
||||
"sha256:1b3ead4a96c9101bef08f9f7d1217c096f31667617b58de957f690c92378b528",
|
||||
"sha256:2068531575a125b87a41802130fa7e29f26c09a2833fea68d9a40cf33902eba6",
|
||||
"sha256:209596a4ec71d990d71d5e0d312ac935d86930e6eecff6ccc7007fe54d703808",
|
||||
"sha256:2c726c276d09fc5c414693a2de063f521052d9ea7c240ce553316f70656c84d4",
|
||||
"sha256:398e44cd480f4d2b7ee8d98385ca104e35c81525dd98c519acff1b79bdaac363",
|
||||
"sha256:52b1eb8c83f178ab787f3a4283f68258525f8d70f778a2f6dd54d3b5e5fb4341",
|
||||
"sha256:5feca99c17af94057417d744607b82dd0a664fd5e4ca98061480fd8b14b18d04",
|
||||
"sha256:7538e495704e2ccda9b234b82423a4038f324f3a10c43bc088a1636180f11a41",
|
||||
"sha256:760ad187b1041a154f0e4d0f6aae3e40fdb51d6de16e5c99aedadd9246450e9e",
|
||||
"sha256:777a26c84bea6cd934422ac2e3b78863a37017618b6e5c08f92ef69853e765d3",
|
||||
"sha256:95431a26309a21874005845c21118c83991c63ea800dd44843e42a916aec5899",
|
||||
"sha256:9ad2c92ec681e02baf81fdfa056fe0d818645efa9af1f1cd5fd6f1bd2bdfd805",
|
||||
"sha256:9c6d1a54552b5330bc657b7ef0eae25d00ba7ffe85d9ea8ae6540d2197a3788c",
|
||||
"sha256:aee0c1256be6c07bd3e1263ff920c325b59849dc95392a05f258bb9b259cf39c",
|
||||
"sha256:af3d4a73793725138d6b334d9d247ce7e5f084d96284ed23f22ee626a7b88e39",
|
||||
"sha256:b36b4f3920103a25e1d5d024d155c504080959582b928e91cb608a65c3a49e1a",
|
||||
"sha256:b9574c6f03f685070d859e75c7f9eeca02d6933273b5e69572e5ff9d5e3931c3",
|
||||
"sha256:bff6ad71c81b3bba8fa35f0f1921fb24ff4476235a6e94a26ada2e54370e6da7",
|
||||
"sha256:c190f0899e9f9f8b6b7863debfb739abcb21a5c054f911ca3596d12b8a4c4c7f",
|
||||
"sha256:c907f561b1e83e93fad565bac5ba9c22d96a54e7ea0267c708bffe863cbe4075",
|
||||
"sha256:cae53c389825d3b46fb37538441f75d6aecc4174f615d048321b716df2757fb0",
|
||||
"sha256:dd4a21253f42b8d2b48410cb31fe501d32f8b9fbeb1f55063ad102fe9c425e40",
|
||||
"sha256:dde816ca9dac1d9c01dd504ea5967821606f02e510438120091b84e852367428",
|
||||
"sha256:f2362f3cb0f3172c42938946dbc5b7843c2a28aec307c49100c8b38764eb6927",
|
||||
"sha256:f328adcfebed9f11301eaedfa48e15bdece9b519fb27e6a8c01aa52a17ec31b3",
|
||||
"sha256:f8afcf15cc511ada719a88e013cec87c11aff7b91f019295eb4530f96fe5ef2f",
|
||||
"sha256:fb1bbeac803adea29cedd70781399c99138358c26d05fcbd23c13016b7f5ec65"
|
||||
],
|
||||
"markers": "python_version < '3.8'",
|
||||
"version": "==1.4.3"
|
||||
},
|
||||
"types-psycopg2": {
|
||||
"hashes": [
|
||||
"sha256:77ed80f2668582654623e04fb3d741ecce93effcc39c929d7e02f4a917a538ce",
|
||||
"sha256:98a6e0e9580cd7eb4bd4d20f7c7063d154b2589a2b90c0ce4e3ca6085cde77c6"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.9.1"
|
||||
},
|
||||
"types-requests": {
|
||||
"hashes": [
|
||||
"sha256:b279284e51f668e38ee12d9665e4d789089f532dc2a0be4a1508ca0efd98ba9e",
|
||||
"sha256:ba1d108d512e294b6080c37f6ae7cb2a2abf527560e2b671d1786c1fc46b541a"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==2.25.11"
|
||||
},
|
||||
"typing-extensions": {
|
||||
"hashes": [
|
||||
"sha256:49f75d16ff11f1cd258e1b988ccff82a3ca5570217d7ad8c5f48205dd99a677e",
|
||||
"sha256:d8226d10bc02a29bcc81df19a26e56a9647f8b0a6d4a83924139f4a8b01f17b7",
|
||||
"sha256:f1d25edafde516b146ecd0613dabcc61409817af4766fbbcfb8d1ad4ec441a34"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==3.10.0.2"
|
||||
},
|
||||
"virtualenv": {
|
||||
"hashes": [
|
||||
"sha256:10062e34c204b5e4ec5f62e6ef2473f8ba76513a9a617e873f1f8fb4a519d300",
|
||||
"sha256:bcc17f0b3a29670dd777d6f0755a4c04f28815395bca279cdcb213b97199a6b8"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
|
||||
"version": "==20.8.1"
|
||||
},
|
||||
"virtualenv-clone": {
|
||||
"hashes": [
|
||||
"sha256:418ee935c36152f8f153c79824bb93eaf6f0f7984bae31d3f48f350b9183501a",
|
||||
"sha256:44d5263bceed0bac3e1424d64f798095233b64def1c5689afa43dc3223caf5b0"
|
||||
],
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==0.5.7"
|
||||
},
|
||||
"yapf": {
|
||||
"hashes": [
|
||||
"sha256:408fb9a2b254c302f49db83c59f9aa0b4b0fd0ec25be3a5c51181327922ff63d",
|
||||
"sha256:e3a234ba8455fe201eaa649cdac872d590089a18b661e39bbac7020978dd9c2e"
|
||||
],
|
||||
"index": "pypi",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"zipp": {
|
||||
"hashes": [
|
||||
"sha256:71c644c5369f4a6e07636f0aa966270449561fcea2e3d6747b8d23efaa9d7832",
|
||||
"sha256:9fe5ea21568a0a70e50f273397638d39b03353731e6cbbb3fd8502a33fec40bc"
|
||||
],
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==3.6.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3,13 +3,21 @@
|
||||
This directory contains integration tests.
|
||||
|
||||
Prerequisites:
|
||||
- Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python)
|
||||
- Python 3.7 or later
|
||||
- Development headers may also be needed to build `psycopg2` from source.
|
||||
- Python 3.7 is recommended if you want to update tests.
|
||||
- Dependencies: install them via `pipenv install`. Note that Debian/Ubuntu
|
||||
packages are stale, as it commonly happens, so manual installation is not
|
||||
recommended.
|
||||
Exact version of `pipenv` is not important unless you change dependencies.
|
||||
Run `pipenv shell` to activate the venv or use `pipenv run` to run a single
|
||||
command in the venv, e.g. `pipenv run pytest`.
|
||||
- Zenith and Postgres binaries
|
||||
- See the root [README.md](/README.md) for build directions
|
||||
- See the root README.md for build directions
|
||||
- Tests can be run from the git tree; or see the environment variables
|
||||
below to run from other directories.
|
||||
- The zenith git repo, including the postgres submodule
|
||||
(for some tests, e.g. `pg_regress`)
|
||||
(for some tests, e.g. pg_regress)
|
||||
|
||||
### Test Organization
|
||||
|
||||
@@ -30,15 +38,15 @@ be stored under a directory `test_output`.
|
||||
|
||||
You can run all the tests with:
|
||||
|
||||
`pipenv run pytest`
|
||||
`pytest`
|
||||
|
||||
If you want to run all the tests in a particular file:
|
||||
|
||||
`pipenv run pytest test_pgbench.py`
|
||||
`pytest test_pgbench.py`
|
||||
|
||||
If you want to run all tests that have the string "bench" in their names:
|
||||
|
||||
`pipenv run pytest -k bench`
|
||||
`pytest -k bench`
|
||||
|
||||
Useful environment variables:
|
||||
|
||||
@@ -97,11 +105,47 @@ don't need to worry about cleaning up. Logs and test data are preserved for the
|
||||
in a directory under `../test_output/<testname>`
|
||||
|
||||
### Before submitting a patch
|
||||
Ensure that you pass all [obligatory checks](/docs/sourcetree.md#obligatory-checks).
|
||||
#### Obligatory checks
|
||||
Install dev dependencies via `pipenv --python 3.7 install --dev` (better)
|
||||
or `pipenv install --dev` (if you don't have Python 3.7 and don't need to change dependencies).
|
||||
|
||||
Also consider:
|
||||
We force code formatting via yapf and type hints via mypy.
|
||||
Run the following commands in the `test_runner/` directory:
|
||||
|
||||
```bash
|
||||
pipenv run yapf -ri . # All code is reformatted
|
||||
pipenv run mypy . # Ensure there are no typing errors
|
||||
```
|
||||
|
||||
#### Advisable actions
|
||||
* Writing a couple of docstrings to clarify the reasoning behind a new test.
|
||||
* Running `flake8` (or a linter of your choice, e.g. `pycodestyle`) and fixing possible defects, if any.
|
||||
* Adding more type hints to your code to avoid `Any`, especially:
|
||||
* For fixture parameters, they are not automatically deduced.
|
||||
* For function arguments and return values.
|
||||
|
||||
#### Changing dependencies
|
||||
You have to update `Pipfile.lock` if you have changed `Pipfile`:
|
||||
|
||||
```bash
|
||||
pipenv --python 3.7 install --dev # Re-create venv for Python 3.7 and install recent pipenv inside
|
||||
pipenv run pipenv --version # Should be at least 2021.5.29
|
||||
pipenv run pipenv lock # Regenerate Pipfile.lock
|
||||
```
|
||||
|
||||
As the minimal supported version is Python 3.7 and we use it in CI,
|
||||
you have to use a Python 3.7 environment when updating `Pipfile.lock`.
|
||||
Otherwise some back-compatibility packages will be missing.
|
||||
|
||||
It is also important to run recent `pipenv`.
|
||||
Older versions remove markers from `Pipfile.lock`.
|
||||
|
||||
If you don't have Python 3.7, you should install it and its headers (for `psycopg2`)
|
||||
separately, e.g.:
|
||||
|
||||
```bash
|
||||
# In Ubuntu
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa
|
||||
sudo apt update
|
||||
sudo apt install python3.7 python3.7-dev
|
||||
```
|
||||
|
||||
@@ -1,11 +1,6 @@
|
||||
import subprocess
|
||||
from contextlib import closing
|
||||
|
||||
import psycopg2.extras
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.utils import print_gc_result
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -24,16 +19,8 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
main_cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = main_cur.fetchone()[0]
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)')
|
||||
|
||||
# keep some early lsn to test branch creation on out of date lsn
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
gced_lsn = main_cur.fetchone()[0]
|
||||
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
@@ -100,27 +87,10 @@ def test_branch_behind(zenith_simple_env: ZenithEnv):
|
||||
assert cur.fetchone() == (1, )
|
||||
|
||||
# branch at pre-initdb lsn
|
||||
with pytest.raises(Exception, match="invalid branch start lsn"):
|
||||
#
|
||||
# FIXME: This works currently, but probably shouldn't be allowed
|
||||
try:
|
||||
env.zenith_cli(["branch", "test_branch_preinitdb", "test_branch_behind@0/42"])
|
||||
|
||||
# check that we cannot create branch based on garbage collected data
|
||||
with closing(env.pageserver.connect()) as psconn:
|
||||
with psconn.cursor(cursor_factory=psycopg2.extras.DictCursor) as pscur:
|
||||
# call gc to advace latest_gc_cutoff_lsn
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
|
||||
row = pscur.fetchone()
|
||||
print_gc_result(row)
|
||||
|
||||
with pytest.raises(Exception, match="invalid branch start lsn"):
|
||||
# this gced_lsn is pretty random, so if gc is disabled this woudln't fail
|
||||
env.zenith_cli(["branch", "test_branch_create_fail", f"test_branch_behind@{gced_lsn}"])
|
||||
|
||||
# check that after gc everything is still there
|
||||
hundred_cur.execute('SELECT count(*) FROM foo')
|
||||
assert hundred_cur.fetchone() == (100, )
|
||||
|
||||
more_cur.execute('SELECT count(*) FROM foo')
|
||||
assert more_cur.fetchone() == (200100, )
|
||||
|
||||
main_cur.execute('SELECT count(*) FROM foo')
|
||||
assert main_cur.fetchone() == (400100, )
|
||||
# FIXME: assert false, "branch with invalid LSN should have failed"
|
||||
except subprocess.CalledProcessError:
|
||||
log.info("Branch creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
from io import BytesIO
|
||||
import asyncio
|
||||
import asyncpg
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import ZenithEnv, Postgres
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
async def repeat_bytes(buf, repetitions: int):
|
||||
for i in range(repetitions):
|
||||
yield buf
|
||||
|
||||
|
||||
async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str):
|
||||
buf = BytesIO()
|
||||
for i in range(1000):
|
||||
buf.write(
|
||||
f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode())
|
||||
buf.seek(0)
|
||||
|
||||
copy_input = repeat_bytes(buf.read(), 5000)
|
||||
|
||||
pg_conn = await pg.connect_async()
|
||||
await pg_conn.copy_to_table(table_name, source=copy_input)
|
||||
|
||||
|
||||
async def parallel_load_same_table(pg: Postgres, n_parallel: int):
|
||||
workers = []
|
||||
for worker_id in range(n_parallel):
|
||||
worker = copy_test_data_to_table(pg, worker_id, f'copytest')
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
# await all workers
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
|
||||
# Load data into one table with COPY TO from 5 parallel connections
|
||||
def test_parallel_copy(zenith_simple_env: ZenithEnv, n_parallel=5):
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_parallel_copy", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_parallel_copy')
|
||||
log.info("postgres is running on 'test_parallel_copy' branch")
|
||||
|
||||
# Create test table
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute(f'CREATE TABLE copytest (i int, t text)')
|
||||
|
||||
# Run COPY TO to load the table with parallel connections.
|
||||
asyncio.run(parallel_load_same_table(pg, n_parallel))
|
||||
@@ -1,5 +1,4 @@
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
import subprocess
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
@@ -16,14 +15,13 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
|
||||
env.zenith_cli(["branch", "test_readonly_node", "empty"])
|
||||
|
||||
pgmain = env.postgres.create_start('test_readonly_node')
|
||||
log.info("postgres is running on 'test_readonly_node' branch")
|
||||
print("postgres is running on 'test_readonly_node' branch")
|
||||
|
||||
main_pg_conn = pgmain.connect()
|
||||
main_cur = main_pg_conn.cursor()
|
||||
|
||||
# Create table, and insert the first 100 rows
|
||||
main_cur.execute('CREATE TABLE foo (t text)')
|
||||
|
||||
main_cur.execute('''
|
||||
INSERT INTO foo
|
||||
SELECT 'long string to consume some space' || g
|
||||
@@ -31,7 +29,7 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_a = main_cur.fetchone()[0]
|
||||
log.info('LSN after 100 rows: ' + lsn_a)
|
||||
print('LSN after 100 rows: ' + lsn_a)
|
||||
|
||||
# Insert some more rows. (This generates enough WAL to fill a few segments.)
|
||||
main_cur.execute('''
|
||||
@@ -41,7 +39,7 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
|
||||
''')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_b = main_cur.fetchone()[0]
|
||||
log.info('LSN after 200100 rows: ' + lsn_b)
|
||||
print('LSN after 200100 rows: ' + lsn_b)
|
||||
|
||||
# Insert many more rows. This generates enough WAL to fill a few segments.
|
||||
main_cur.execute('''
|
||||
@@ -52,7 +50,7 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
|
||||
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn_c = main_cur.fetchone()[0]
|
||||
log.info('LSN after 400100 rows: ' + lsn_c)
|
||||
print('LSN after 400100 rows: ' + lsn_c)
|
||||
|
||||
# Create first read-only node at the point where only 100 rows were inserted
|
||||
pg_hundred = env.postgres.create_start("test_readonly_node_hundred",
|
||||
@@ -86,6 +84,8 @@ def test_readonly_node(zenith_simple_env: ZenithEnv):
|
||||
assert cur.fetchone() == (1, )
|
||||
|
||||
# Create node at pre-initdb lsn
|
||||
with pytest.raises(Exception, match="invalid basebackup lsn"):
|
||||
# compute node startup with invalid LSN should fail
|
||||
env.zenith_cli(["pg", "start", "test_readonly_node_preinitdb", "test_readonly_node@0/42"])
|
||||
try:
|
||||
env.zenith_cli(["pg", "start", "test_branch_preinitdb", "test_readonly_node@0/42"])
|
||||
assert False, "compute node startup with invalid LSN should have failed"
|
||||
except Exception:
|
||||
print("Node creation with pre-initdb LSN failed (as expected)")
|
||||
|
||||
@@ -1,13 +1,22 @@
|
||||
from contextlib import closing
|
||||
import psycopg2.extras
|
||||
import time
|
||||
from fixtures.utils import print_gc_result
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
|
||||
def print_gc_result(row):
|
||||
log.info("GC duration {elapsed} ms".format_map(row))
|
||||
log.info(
|
||||
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
|
||||
.format_map(row))
|
||||
log.info(
|
||||
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
|
||||
.format_map(row))
|
||||
|
||||
|
||||
#
|
||||
# Test Garbage Collection of old layer files
|
||||
#
|
||||
|
||||
@@ -3,17 +3,13 @@ import random
|
||||
import time
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
from contextlib import closing
|
||||
from dataclasses import dataclass, field
|
||||
from multiprocessing import Process, Value
|
||||
from fixtures.zenith_fixtures import PgBin, ZenithEnv, ZenithEnvBuilder
|
||||
from fixtures.utils import lsn_to_hex, mkdir_if_needed
|
||||
from fixtures.log_helper import log
|
||||
from typing import List, Optional
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -38,22 +34,13 @@ def test_normal_work(zenith_env_builder: ZenithEnvBuilder):
|
||||
assert cur.fetchone() == (5000050000, )
|
||||
|
||||
|
||||
@dataclass
|
||||
class BranchMetrics:
|
||||
name: str
|
||||
latest_valid_lsn: int
|
||||
# One entry per each Safekeeper, order is the same
|
||||
flush_lsns: List[int] = field(default_factory=list)
|
||||
commit_lsns: List[int] = field(default_factory=list)
|
||||
|
||||
|
||||
# Run page server and multiple acceptors, and multiple compute nodes running
|
||||
# against different timelines.
|
||||
def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
|
||||
zenith_env_builder.num_safekeepers = 3
|
||||
env = zenith_env_builder.init()
|
||||
|
||||
n_timelines = 3
|
||||
n_timelines = 2
|
||||
|
||||
branches = ["test_wal_acceptors_many_timelines_{}".format(tlin) for tlin in range(n_timelines)]
|
||||
|
||||
@@ -63,114 +50,21 @@ def test_many_timelines(zenith_env_builder: ZenithEnvBuilder):
|
||||
env.zenith_cli(["branch", branch, "main"])
|
||||
pgs.append(env.postgres.create_start(branch))
|
||||
|
||||
tenant_id = uuid.UUID(env.initial_tenant)
|
||||
|
||||
def collect_metrics(message: str) -> List[BranchMetrics]:
|
||||
with env.pageserver.http_client() as pageserver_http:
|
||||
branch_details = [
|
||||
pageserver_http.branch_detail(tenant_id=tenant_id, name=branch)
|
||||
for branch in branches
|
||||
]
|
||||
# All changes visible to pageserver (latest_valid_lsn) should be
|
||||
# confirmed by safekeepers first. As we cannot atomically get
|
||||
# state of both pageserver and safekeepers, we should start with
|
||||
# pageserver. Looking at outdated data from pageserver is ok.
|
||||
# Asking safekeepers first is not ok because new commits may arrive
|
||||
# to both safekeepers and pageserver after we've already obtained
|
||||
# safekeepers' state, it will look contradictory.
|
||||
sk_metrics = [sk.http_client().get_metrics() for sk in env.safekeepers]
|
||||
|
||||
branch_metrics = []
|
||||
with env.pageserver.http_client() as pageserver_http:
|
||||
for branch_detail in branch_details:
|
||||
timeline_id: str = branch_detail["timeline_id"]
|
||||
|
||||
m = BranchMetrics(
|
||||
name=branch_detail["name"],
|
||||
latest_valid_lsn=branch_detail["latest_valid_lsn"],
|
||||
)
|
||||
for sk_m in sk_metrics:
|
||||
m.flush_lsns.append(sk_m.flush_lsn_inexact[timeline_id])
|
||||
m.commit_lsns.append(sk_m.commit_lsn_inexact[timeline_id])
|
||||
|
||||
for flush_lsn, commit_lsn in zip(m.flush_lsns, m.commit_lsns):
|
||||
# Invariant. May be < when transaction is in progress.
|
||||
assert commit_lsn <= flush_lsn
|
||||
# We only call collect_metrics() after a transaction is confirmed by
|
||||
# the compute node, which only happens after a consensus of safekeepers
|
||||
# has confirmed the transaction. We assume majority consensus here.
|
||||
assert (2 * sum(m.latest_valid_lsn <= lsn
|
||||
for lsn in m.flush_lsns) > zenith_env_builder.num_safekeepers)
|
||||
assert (2 * sum(m.latest_valid_lsn <= lsn
|
||||
for lsn in m.commit_lsns) > zenith_env_builder.num_safekeepers)
|
||||
branch_metrics.append(m)
|
||||
log.info(f"{message}: {branch_metrics}")
|
||||
return branch_metrics
|
||||
|
||||
# TODO: https://github.com/zenithdb/zenith/issues/809
|
||||
# collect_metrics("before CREATE TABLE")
|
||||
|
||||
# Do everything in different loops to have actions on different timelines
|
||||
# interleaved.
|
||||
# create schema
|
||||
for pg in pgs:
|
||||
pg.safe_psql("CREATE TABLE t(key int primary key, value text)")
|
||||
init_m = collect_metrics("after CREATE TABLE")
|
||||
|
||||
# Populate data for 2/3 branches
|
||||
class MetricsChecker(threading.Thread):
|
||||
def __init__(self) -> None:
|
||||
super().__init__(daemon=True)
|
||||
self.should_stop = threading.Event()
|
||||
self.exception: Optional[BaseException] = None
|
||||
|
||||
def run(self) -> None:
|
||||
try:
|
||||
while not self.should_stop.is_set():
|
||||
collect_metrics("during INSERT INTO")
|
||||
time.sleep(1)
|
||||
except:
|
||||
log.error("MetricsChecker's thread failed, the test will be failed on .stop() call",
|
||||
exc_info=True)
|
||||
# We want to preserve traceback as well as the exception
|
||||
exc_type, exc_value, exc_tb = sys.exc_info()
|
||||
assert exc_type
|
||||
e = exc_type(exc_value)
|
||||
e.__traceback__ = exc_tb
|
||||
self.exception = e
|
||||
|
||||
def stop(self) -> None:
|
||||
self.should_stop.set()
|
||||
self.join()
|
||||
if self.exception:
|
||||
raise self.exception
|
||||
|
||||
metrics_checker = MetricsChecker()
|
||||
metrics_checker.start()
|
||||
|
||||
for pg in pgs[:-1]:
|
||||
# Populate data
|
||||
for pg in pgs:
|
||||
pg.safe_psql("INSERT INTO t SELECT generate_series(1,100000), 'payload'")
|
||||
|
||||
metrics_checker.stop()
|
||||
|
||||
collect_metrics("after INSERT INTO")
|
||||
|
||||
# Check data for 2/3 branches
|
||||
for pg in pgs[:-1]:
|
||||
# Check data
|
||||
for pg in pgs:
|
||||
res = pg.safe_psql("SELECT sum(key) FROM t")
|
||||
assert res[0] == (5000050000, )
|
||||
|
||||
final_m = collect_metrics("after SELECT")
|
||||
# Assume that LSNs (a) behave similarly in all branches; and (b) INSERT INTO alters LSN significantly.
|
||||
# Also assume that safekeepers will not be significantly out of sync in this test.
|
||||
middle_lsn = (init_m[0].latest_valid_lsn + final_m[0].latest_valid_lsn) // 2
|
||||
assert max(init_m[0].flush_lsns) < middle_lsn < min(final_m[0].flush_lsns)
|
||||
assert max(init_m[0].commit_lsns) < middle_lsn < min(final_m[0].commit_lsns)
|
||||
assert max(init_m[1].flush_lsns) < middle_lsn < min(final_m[1].flush_lsns)
|
||||
assert max(init_m[1].commit_lsns) < middle_lsn < min(final_m[1].commit_lsns)
|
||||
assert max(init_m[2].flush_lsns) <= min(final_m[2].flush_lsns) < middle_lsn
|
||||
assert max(init_m[2].commit_lsns) <= min(final_m[2].commit_lsns) < middle_lsn
|
||||
|
||||
|
||||
# Check that dead minority doesn't prevent the commits: execute insert n_inserts
|
||||
# times, with fault_probability chance of getting a wal acceptor down or up
|
||||
@@ -392,7 +286,6 @@ def test_sync_safekeepers(zenith_env_builder: ZenithEnvBuilder, pg_bin: PgBin):
|
||||
"lm_prefix": "prefix",
|
||||
"lm_message": "message",
|
||||
"set_commit_lsn": True,
|
||||
"send_proposer_elected": True,
|
||||
"term": 2,
|
||||
"begin_lsn": begin_lsn,
|
||||
"epoch_start_lsn": epoch_start_lsn,
|
||||
|
||||
@@ -109,7 +109,7 @@ async def wait_for_lsn(safekeeper: Safekeeper,
|
||||
timeline_id: str,
|
||||
wait_lsn: str,
|
||||
polling_interval=1,
|
||||
timeout=60):
|
||||
timeout=600):
|
||||
"""
|
||||
Poll flush_lsn from safekeeper until it's greater or equal than
|
||||
provided wait_lsn. To do that, timeline_status is fetched from
|
||||
@@ -147,11 +147,6 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w
|
||||
period_time = 10
|
||||
iterations = 6
|
||||
|
||||
# Set timeout for this test at 5 minutes. It should be enough for test to complete
|
||||
# and less than CircleCI's no_output_timeout, taking into account that this timeout
|
||||
# is checked only at the beginning of every iteration.
|
||||
test_timeout_at = time.monotonic() + 5 * 60
|
||||
|
||||
pg_conn = await pg.connect_async()
|
||||
tenant_id = await pg_conn.fetchval("show zenith.zenith_tenant")
|
||||
timeline_id = await pg_conn.fetchval("show zenith.zenith_timeline")
|
||||
@@ -167,8 +162,6 @@ async def run_restarts_under_load(pg: Postgres, acceptors: List[Safekeeper], n_w
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
for it in range(iterations):
|
||||
assert time.monotonic() < test_timeout_at, 'test timed out'
|
||||
|
||||
victim_idx = it % len(acceptors)
|
||||
victim = acceptors[victim_idx]
|
||||
victim.stop()
|
||||
|
||||
@@ -1 +1 @@
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
pytest_plugins = ("fixtures.zenith_fixtures")
|
||||
|
||||
@@ -1,22 +1,29 @@
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
import subprocess
|
||||
import timeit
|
||||
import calendar
|
||||
import enum
|
||||
from datetime import datetime
|
||||
import pathlib
|
||||
import uuid
|
||||
import psycopg2
|
||||
import pytest
|
||||
from _pytest.config import Config
|
||||
from _pytest.runner import CallInfo
|
||||
from _pytest.terminal import TerminalReporter
|
||||
import warnings
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
from contextlib import contextmanager
|
||||
from contextlib import closing
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
# Type-related stuff
|
||||
from typing import Iterator
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
|
||||
from typing_extensions import Literal
|
||||
|
||||
from .utils import (get_self_dir, mkdir_if_needed, subprocess_capture)
|
||||
"""
|
||||
This file contains fixtures for micro-benchmarks.
|
||||
|
||||
@@ -32,7 +39,7 @@ def test_mybench(zenith_simple_env: env, zenbenchmark):
|
||||
|
||||
# Initialize the test
|
||||
...
|
||||
|
||||
|
||||
# Run the test, timing how long it takes
|
||||
with zenbenchmark.record_duration('test_query'):
|
||||
cur.execute('SELECT test_query(...)')
|
||||
@@ -48,91 +55,36 @@ in the test initialization, or measure disk usage after the test query.
|
||||
"""
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class PgBenchRunResult:
|
||||
scale: int
|
||||
number_of_clients: int
|
||||
number_of_threads: int
|
||||
number_of_transactions_actually_processed: int
|
||||
latency_average: float
|
||||
latency_stddev: float
|
||||
tps_including_connection_time: float
|
||||
tps_excluding_connection_time: float
|
||||
init_duration: float
|
||||
init_start_timestamp: int
|
||||
init_end_timestamp: int
|
||||
run_duration: float
|
||||
run_start_timestamp: int
|
||||
run_end_timestamp: int
|
||||
# TODO: It would perhaps be better to store the results as additional
|
||||
# properties in the pytest TestReport objects, to make them visible to
|
||||
# other pytest tools.
|
||||
class ZenithBenchmarkResults:
|
||||
""" An object for recording benchmark results. """
|
||||
def __init__(self):
|
||||
self.results = []
|
||||
|
||||
# TODO progress
|
||||
def record(self, test_name: str, metric_name: str, metric_value: float, unit: str):
|
||||
"""
|
||||
Record a benchmark result.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def parse_from_output(
|
||||
cls,
|
||||
out: 'subprocess.CompletedProcess[str]',
|
||||
init_duration: float,
|
||||
init_start_timestamp: int,
|
||||
init_end_timestamp: int,
|
||||
run_duration: float,
|
||||
run_start_timestamp: int,
|
||||
run_end_timestamp: int,
|
||||
):
|
||||
stdout_lines = out.stdout.splitlines()
|
||||
# we know significant parts of these values from test input
|
||||
# but to be precise take them from output
|
||||
# scaling factor: 5
|
||||
assert "scaling factor" in stdout_lines[1]
|
||||
scale = int(stdout_lines[1].split()[-1])
|
||||
# number of clients: 1
|
||||
assert "number of clients" in stdout_lines[3]
|
||||
number_of_clients = int(stdout_lines[3].split()[-1])
|
||||
# number of threads: 1
|
||||
assert "number of threads" in stdout_lines[4]
|
||||
number_of_threads = int(stdout_lines[4].split()[-1])
|
||||
# number of transactions actually processed: 1000/1000
|
||||
assert "number of transactions actually processed" in stdout_lines[6]
|
||||
number_of_transactions_actually_processed = int(stdout_lines[6].split("/")[1])
|
||||
# latency average = 19.894 ms
|
||||
assert "latency average" in stdout_lines[7]
|
||||
latency_average = stdout_lines[7].split()[-2]
|
||||
# latency stddev = 3.387 ms
|
||||
assert "latency stddev" in stdout_lines[8]
|
||||
latency_stddev = stdout_lines[8].split()[-2]
|
||||
# tps = 50.219689 (including connections establishing)
|
||||
assert "(including connections establishing)" in stdout_lines[9]
|
||||
tps_including_connection_time = stdout_lines[9].split()[2]
|
||||
# tps = 50.264435 (excluding connections establishing)
|
||||
assert "(excluding connections establishing)" in stdout_lines[10]
|
||||
tps_excluding_connection_time = stdout_lines[10].split()[2]
|
||||
|
||||
return cls(
|
||||
scale=scale,
|
||||
number_of_clients=number_of_clients,
|
||||
number_of_threads=number_of_threads,
|
||||
number_of_transactions_actually_processed=number_of_transactions_actually_processed,
|
||||
latency_average=float(latency_average),
|
||||
latency_stddev=float(latency_stddev),
|
||||
tps_including_connection_time=float(tps_including_connection_time),
|
||||
tps_excluding_connection_time=float(tps_excluding_connection_time),
|
||||
init_duration=init_duration,
|
||||
init_start_timestamp=init_start_timestamp,
|
||||
init_end_timestamp=init_end_timestamp,
|
||||
run_duration=run_duration,
|
||||
run_start_timestamp=run_start_timestamp,
|
||||
run_end_timestamp=run_end_timestamp,
|
||||
)
|
||||
self.results.append((test_name, metric_name, metric_value, unit))
|
||||
|
||||
|
||||
@enum.unique
|
||||
class MetricReport(str, enum.Enum): # str is a hack to make it json serializable
|
||||
# this means that this is a constant test parameter
|
||||
# like number of transactions, or number of clients
|
||||
TEST_PARAM = 'test_param'
|
||||
# reporter can use it to mark test runs with higher values as improvements
|
||||
HIGHER_IS_BETTER = 'higher_is_better'
|
||||
# the same but for lower values
|
||||
LOWER_IS_BETTER = 'lower_is_better'
|
||||
# Will be recreated in each session.
|
||||
zenbenchmark_results: ZenithBenchmarkResults = ZenithBenchmarkResults()
|
||||
|
||||
|
||||
# Session scope fixture that initializes the results object
|
||||
@pytest.fixture(autouse=True, scope='session')
|
||||
def zenbenchmark_global(request) -> Iterator[ZenithBenchmarkResults]:
|
||||
"""
|
||||
This is a python decorator for benchmark fixtures
|
||||
"""
|
||||
global zenbenchmark_results
|
||||
zenbenchmark_results = ZenithBenchmarkResults()
|
||||
|
||||
yield zenbenchmark_results
|
||||
|
||||
|
||||
class ZenithBenchmarker:
|
||||
@@ -140,109 +92,30 @@ class ZenithBenchmarker:
|
||||
An object for recording benchmark results. This is created for each test
|
||||
function by the zenbenchmark fixture
|
||||
"""
|
||||
def __init__(self, property_recorder):
|
||||
# property recorder here is a pytest fixture provided by junitxml module
|
||||
# https://docs.pytest.org/en/6.2.x/reference.html#pytest.junitxml.record_property
|
||||
self.property_recorder = property_recorder
|
||||
def __init__(self, results, request):
|
||||
self.results = results
|
||||
self.request = request
|
||||
|
||||
def record(
|
||||
self,
|
||||
metric_name: str,
|
||||
metric_value: float,
|
||||
unit: str,
|
||||
report: MetricReport,
|
||||
):
|
||||
def record(self, metric_name: str, metric_value: float, unit: str):
|
||||
"""
|
||||
Record a benchmark result.
|
||||
"""
|
||||
# just to namespace the value
|
||||
name = f"zenith_benchmarker_{metric_name}"
|
||||
self.property_recorder(
|
||||
name,
|
||||
{
|
||||
"name": metric_name,
|
||||
"value": metric_value,
|
||||
"unit": unit,
|
||||
"report": report,
|
||||
},
|
||||
)
|
||||
self.results.record(self.request.node.name, metric_name, metric_value, unit)
|
||||
|
||||
@contextmanager
|
||||
def record_duration(self, metric_name: str):
|
||||
def record_duration(self, metric_name):
|
||||
"""
|
||||
Record a duration. Usage:
|
||||
|
||||
|
||||
with zenbenchmark.record_duration('foobar_runtime'):
|
||||
foobar() # measure this
|
||||
|
||||
"""
|
||||
start = timeit.default_timer()
|
||||
yield
|
||||
end = timeit.default_timer()
|
||||
|
||||
self.record(
|
||||
metric_name=metric_name,
|
||||
metric_value=end - start,
|
||||
unit="s",
|
||||
report=MetricReport.LOWER_IS_BETTER,
|
||||
)
|
||||
|
||||
def record_pg_bench_result(self, pg_bench_result: PgBenchRunResult):
|
||||
self.record("scale", pg_bench_result.scale, '', MetricReport.TEST_PARAM)
|
||||
self.record("number_of_clients",
|
||||
pg_bench_result.number_of_clients,
|
||||
'',
|
||||
MetricReport.TEST_PARAM)
|
||||
self.record("number_of_threads",
|
||||
pg_bench_result.number_of_threads,
|
||||
'',
|
||||
MetricReport.TEST_PARAM)
|
||||
self.record(
|
||||
"number_of_transactions_actually_processed",
|
||||
pg_bench_result.number_of_transactions_actually_processed,
|
||||
'',
|
||||
# thats because this is predefined by test matrix and doesnt change across runs
|
||||
report=MetricReport.TEST_PARAM,
|
||||
)
|
||||
self.record("latency_average",
|
||||
pg_bench_result.latency_average,
|
||||
unit="ms",
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
self.record("latency_stddev",
|
||||
pg_bench_result.latency_stddev,
|
||||
unit="ms",
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
self.record("tps_including_connection_time",
|
||||
pg_bench_result.tps_including_connection_time,
|
||||
'',
|
||||
report=MetricReport.HIGHER_IS_BETTER)
|
||||
self.record("tps_excluding_connection_time",
|
||||
pg_bench_result.tps_excluding_connection_time,
|
||||
'',
|
||||
report=MetricReport.HIGHER_IS_BETTER)
|
||||
self.record("init_duration",
|
||||
pg_bench_result.init_duration,
|
||||
unit="s",
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
self.record("init_start_timestamp",
|
||||
pg_bench_result.init_start_timestamp,
|
||||
'',
|
||||
MetricReport.TEST_PARAM)
|
||||
self.record("init_end_timestamp",
|
||||
pg_bench_result.init_end_timestamp,
|
||||
'',
|
||||
MetricReport.TEST_PARAM)
|
||||
self.record("run_duration",
|
||||
pg_bench_result.run_duration,
|
||||
unit="s",
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
self.record("run_start_timestamp",
|
||||
pg_bench_result.run_start_timestamp,
|
||||
'',
|
||||
MetricReport.TEST_PARAM)
|
||||
self.record("run_end_timestamp",
|
||||
pg_bench_result.run_end_timestamp,
|
||||
'',
|
||||
MetricReport.TEST_PARAM)
|
||||
self.results.record(self.request.node.name, metric_name, end - start, 's')
|
||||
|
||||
def get_io_writes(self, pageserver) -> int:
|
||||
"""
|
||||
@@ -276,7 +149,7 @@ class ZenithBenchmarker:
|
||||
assert matches
|
||||
return int(round(float(matches.group(1))))
|
||||
|
||||
def get_timeline_size(self, repo_dir: Path, tenantid: str, timelineid: str):
|
||||
def get_timeline_size(self, repo_dir: str, tenantid: str, timelineid: str):
|
||||
"""
|
||||
Calculate the on-disk size of a timeline
|
||||
"""
|
||||
@@ -298,90 +171,47 @@ class ZenithBenchmarker:
|
||||
yield
|
||||
after = self.get_io_writes(pageserver)
|
||||
|
||||
self.record(metric_name,
|
||||
round((after - before) / (1024 * 1024)),
|
||||
"MB",
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
self.results.record(self.request.node.name,
|
||||
metric_name,
|
||||
round((after - before) / (1024 * 1024)),
|
||||
'MB')
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
|
||||
def zenbenchmark(record_property) -> Iterator[ZenithBenchmarker]:
|
||||
@pytest.fixture(scope='function')
|
||||
def zenbenchmark(zenbenchmark_global, request) -> Iterator[ZenithBenchmarker]:
|
||||
"""
|
||||
This is a python decorator for benchmark fixtures. It contains functions for
|
||||
recording measurements, and prints them out at the end.
|
||||
"""
|
||||
benchmarker = ZenithBenchmarker(record_property)
|
||||
benchmarker = ZenithBenchmarker(zenbenchmark_global, request)
|
||||
yield benchmarker
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--out-dir",
|
||||
dest="out_dir",
|
||||
help="Directory to ouput performance tests results to.",
|
||||
)
|
||||
|
||||
|
||||
def get_out_path(target_dir: Path, revision: str) -> Path:
|
||||
"""
|
||||
get output file path
|
||||
if running in the CI uses commit revision
|
||||
to avoid duplicates uses counter
|
||||
"""
|
||||
# use UTC timestamp as a counter marker to avoid weird behaviour
|
||||
# when for example files are deleted
|
||||
ts = calendar.timegm(datetime.utcnow().utctimetuple())
|
||||
path = target_dir / f"{ts}_{revision}.json"
|
||||
assert not path.exists()
|
||||
return path
|
||||
|
||||
|
||||
# Hook to print the results at the end
|
||||
@pytest.hookimpl(hookwrapper=True)
|
||||
def pytest_terminal_summary(terminalreporter: TerminalReporter, exitstatus: int, config: Config):
|
||||
yield
|
||||
revision = os.getenv("GITHUB_SHA", "local")
|
||||
platform = os.getenv("PLATFORM", "local")
|
||||
|
||||
terminalreporter.section("Benchmark results", "-")
|
||||
global zenbenchmark_results
|
||||
|
||||
result = []
|
||||
for test_report in terminalreporter.stats.get("passed", []):
|
||||
result_entry = []
|
||||
|
||||
for _, recorded_property in test_report.user_properties:
|
||||
terminalreporter.write("{}.{}: ".format(test_report.head_line,
|
||||
recorded_property["name"]))
|
||||
unit = recorded_property["unit"]
|
||||
value = recorded_property["value"]
|
||||
if unit == "MB":
|
||||
terminalreporter.write("{0:,.0f}".format(value), green=True)
|
||||
elif unit in ("s", "ms") and isinstance(value, float):
|
||||
terminalreporter.write("{0:,.3f}".format(value), green=True)
|
||||
elif isinstance(value, float):
|
||||
terminalreporter.write("{0:,.4f}".format(value), green=True)
|
||||
else:
|
||||
terminalreporter.write(str(value), green=True)
|
||||
terminalreporter.line(" {}".format(unit))
|
||||
|
||||
result_entry.append(recorded_property)
|
||||
|
||||
result.append({
|
||||
"suit": test_report.nodeid,
|
||||
"total_duration": test_report.duration,
|
||||
"data": result_entry,
|
||||
})
|
||||
|
||||
out_dir = config.getoption("out_dir")
|
||||
if out_dir is None:
|
||||
warnings.warn("no out dir provided to store performance test results")
|
||||
if not zenbenchmark_results:
|
||||
return
|
||||
|
||||
if not result:
|
||||
warnings.warn("no results to store (no passed test suites)")
|
||||
return
|
||||
terminalreporter.section('Benchmark results', "-")
|
||||
|
||||
get_out_path(Path(out_dir), revision=revision).write_text(
|
||||
json.dumps({
|
||||
"revision": revision, "platform": platform, "result": result
|
||||
}, indent=4))
|
||||
for result in zenbenchmark_results.results:
|
||||
func = result[0]
|
||||
metric_name = result[1]
|
||||
metric_value = result[2]
|
||||
unit = result[3]
|
||||
|
||||
terminalreporter.write("{}.{}: ".format(func, metric_name))
|
||||
|
||||
if unit == 'MB':
|
||||
terminalreporter.write("{0:,.0f}".format(metric_value), green=True)
|
||||
elif unit == 's':
|
||||
terminalreporter.write("{0:,.3f}".format(metric_value), green=True)
|
||||
else:
|
||||
terminalreporter.write("{0:,.4f}".format(metric_value), green=True)
|
||||
|
||||
terminalreporter.line(" {}".format(unit))
|
||||
|
||||
@@ -69,13 +69,3 @@ def lsn_from_hex(lsn_hex: str) -> int:
|
||||
""" Convert lsn from hex notation to int. """
|
||||
l, r = lsn_hex.split('/')
|
||||
return (int(l, 16) << 32) + int(r, 16)
|
||||
|
||||
|
||||
def print_gc_result(row):
|
||||
log.info("GC duration {elapsed} ms".format_map(row))
|
||||
log.info(
|
||||
" REL total: {layer_relfiles_total}, needed_by_cutoff {layer_relfiles_needed_by_cutoff}, needed_by_branches: {layer_relfiles_needed_by_branches}, not_updated: {layer_relfiles_not_updated}, needed_as_tombstone {layer_relfiles_needed_as_tombstone}, removed: {layer_relfiles_removed}, dropped: {layer_relfiles_dropped}"
|
||||
.format_map(row))
|
||||
log.info(
|
||||
" NONREL total: {layer_nonrelfiles_total}, needed_by_cutoff {layer_nonrelfiles_needed_by_cutoff}, needed_by_branches: {layer_nonrelfiles_needed_by_branches}, not_updated: {layer_nonrelfiles_not_updated}, needed_as_tombstone {layer_nonrelfiles_needed_as_tombstone}, removed: {layer_nonrelfiles_removed}, dropped: {layer_nonrelfiles_dropped}"
|
||||
.format_map(row))
|
||||
|
||||
@@ -1,25 +1,24 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from cached_property import cached_property
|
||||
import asyncpg
|
||||
import os
|
||||
import pathlib
|
||||
import uuid
|
||||
import warnings
|
||||
import jwt
|
||||
import json
|
||||
import psycopg2
|
||||
import pytest
|
||||
import re
|
||||
import shutil
|
||||
import signal
|
||||
import socket
|
||||
import subprocess
|
||||
import time
|
||||
import filecmp
|
||||
import tempfile
|
||||
|
||||
from contextlib import closing
|
||||
from contextlib import closing, suppress
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
@@ -27,7 +26,6 @@ from dataclasses import dataclass
|
||||
from psycopg2.extensions import connection as PgConnection
|
||||
from typing import Any, Callable, Dict, Iterator, List, Optional, TypeVar, cast
|
||||
from typing_extensions import Literal
|
||||
import pytest
|
||||
|
||||
import requests
|
||||
|
||||
@@ -60,16 +58,6 @@ DEFAULT_POSTGRES_DIR = 'tmp_install'
|
||||
BASE_PORT = 15000
|
||||
WORKER_PORT_NUM = 100
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
parser.addoption(
|
||||
"--skip-interfering-proc-check",
|
||||
dest="skip_interfering_proc_check",
|
||||
action="store_true",
|
||||
help="skip check for interferring processes",
|
||||
)
|
||||
|
||||
|
||||
# These are set in pytest_configure()
|
||||
base_dir = ""
|
||||
zenith_binpath = ""
|
||||
@@ -77,10 +65,14 @@ pg_distrib_dir = ""
|
||||
top_output_dir = ""
|
||||
|
||||
|
||||
def check_interferring_processes(config):
|
||||
if config.getoption("skip_interfering_proc_check"):
|
||||
warnings.warn("interferring process check is skipped")
|
||||
return
|
||||
def pytest_configure(config):
|
||||
"""
|
||||
Ensure that no unwanted daemons are running before we start testing.
|
||||
Check that we do not owerflow available ports range.
|
||||
"""
|
||||
numprocesses = config.getoption('numprocesses')
|
||||
if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports
|
||||
raise Exception('Too many workers configured. Cannot distrubute ports for services.')
|
||||
|
||||
# does not use -c as it is not supported on macOS
|
||||
cmd = ['pgrep', 'pageserver|postgres|safekeeper']
|
||||
@@ -94,36 +86,11 @@ def check_interferring_processes(config):
|
||||
'Found interfering processes running. Stop all Zenith pageservers, nodes, safekeepers, as well as stand-alone Postgres.'
|
||||
)
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
"""
|
||||
Ensure that no unwanted daemons are running before we start testing.
|
||||
Check that we do not owerflow available ports range.
|
||||
"""
|
||||
check_interferring_processes(config)
|
||||
|
||||
numprocesses = config.getoption('numprocesses')
|
||||
if numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768: # do not use ephemeral ports
|
||||
raise Exception('Too many workers configured. Cannot distrubute ports for services.')
|
||||
|
||||
# find the base directory (currently this is the git root)
|
||||
global base_dir
|
||||
base_dir = os.path.normpath(os.path.join(get_self_dir(), '../..'))
|
||||
log.info(f'base_dir is {base_dir}')
|
||||
|
||||
# Compute the top-level directory for all tests.
|
||||
global top_output_dir
|
||||
env_test_output = os.environ.get('TEST_OUTPUT')
|
||||
if env_test_output is not None:
|
||||
top_output_dir = env_test_output
|
||||
else:
|
||||
top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
|
||||
mkdir_if_needed(top_output_dir)
|
||||
|
||||
if os.getenv("REMOTE_ENV"):
|
||||
# we are in remote env and do not have zenith binaries locally
|
||||
# this is the case for benchmarks run on self-hosted runner
|
||||
return
|
||||
# Find the zenith binaries.
|
||||
global zenith_binpath
|
||||
env_zenith_bin = os.environ.get('ZENITH_BIN')
|
||||
@@ -133,7 +100,7 @@ def pytest_configure(config):
|
||||
zenith_binpath = os.path.join(base_dir, 'target/debug')
|
||||
log.info(f'zenith_binpath is {zenith_binpath}')
|
||||
if not os.path.exists(os.path.join(zenith_binpath, 'pageserver')):
|
||||
raise Exception('zenith binaries not found at "{}"'.format(zenith_binpath))
|
||||
raise Exception('zenith binaries not found at "{}"'.format(zenith_dir))
|
||||
|
||||
# Find the postgres installation.
|
||||
global pg_distrib_dir
|
||||
@@ -146,6 +113,15 @@ def pytest_configure(config):
|
||||
if not os.path.exists(os.path.join(pg_distrib_dir, 'bin/postgres')):
|
||||
raise Exception('postgres not found at "{}"'.format(pg_distrib_dir))
|
||||
|
||||
# Compute the top-level directory for all tests.
|
||||
global top_output_dir
|
||||
env_test_output = os.environ.get('TEST_OUTPUT')
|
||||
if env_test_output is not None:
|
||||
top_output_dir = env_test_output
|
||||
else:
|
||||
top_output_dir = os.path.join(base_dir, DEFAULT_OUTPUT_DIR)
|
||||
mkdir_if_needed(top_output_dir)
|
||||
|
||||
|
||||
def zenfixture(func: Fn) -> Fn:
|
||||
"""
|
||||
@@ -509,12 +485,6 @@ sync = false # Disable fsyncs to make the tests go faster
|
||||
env_vars['ZENITH_REPO_DIR'] = str(self.repo_dir)
|
||||
env_vars['POSTGRES_DISTRIB_DIR'] = str(pg_distrib_dir)
|
||||
|
||||
# Pass coverage settings
|
||||
var = 'LLVM_PROFILE_FILE'
|
||||
val = os.environ.get(var)
|
||||
if val:
|
||||
env_vars[var] = val
|
||||
|
||||
# Intercept CalledProcessError and print more info
|
||||
try:
|
||||
res = subprocess.run(args,
|
||||
@@ -722,7 +692,7 @@ class ZenithPageserver(PgProtocol):
|
||||
def __exit__(self, exc_type, exc, tb):
|
||||
self.stop(True)
|
||||
|
||||
def http_client(self, auth_token: Optional[str] = None) -> ZenithPageserverHttpClient:
|
||||
def http_client(self, auth_token: Optional[str] = None):
|
||||
return ZenithPageserverHttpClient(
|
||||
port=self.service_port.http,
|
||||
auth_token=auth_token,
|
||||
@@ -1091,7 +1061,7 @@ class Safekeeper:
|
||||
assert isinstance(res, dict)
|
||||
return res
|
||||
|
||||
def http_client(self) -> SafekeeperHttpClient:
|
||||
def http_client(self):
|
||||
return SafekeeperHttpClient(port=self.port.http)
|
||||
|
||||
|
||||
@@ -1101,14 +1071,6 @@ class SafekeeperTimelineStatus:
|
||||
flush_lsn: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class SafekeeperMetrics:
|
||||
# These are metrics from Prometheus which uses float64 internally.
|
||||
# As a consequence, values may differ from real original int64s.
|
||||
flush_lsn_inexact: Dict[str, int] = field(default_factory=dict)
|
||||
commit_lsn_inexact: Dict[str, int] = field(default_factory=dict)
|
||||
|
||||
|
||||
class SafekeeperHttpClient(requests.Session):
|
||||
def __init__(self, port: int) -> None:
|
||||
super().__init__()
|
||||
@@ -1124,22 +1086,6 @@ class SafekeeperHttpClient(requests.Session):
|
||||
return SafekeeperTimelineStatus(acceptor_epoch=resj['acceptor_state']['epoch'],
|
||||
flush_lsn=resj['flush_lsn'])
|
||||
|
||||
def get_metrics(self) -> SafekeeperMetrics:
|
||||
request_result = self.get(f"http://localhost:{self.port}/metrics")
|
||||
request_result.raise_for_status()
|
||||
all_metrics_text = request_result.text
|
||||
|
||||
metrics = SafekeeperMetrics()
|
||||
for match in re.finditer(r'^safekeeper_flush_lsn{ztli="([0-9a-f]+)"} (\S+)$',
|
||||
all_metrics_text,
|
||||
re.MULTILINE):
|
||||
metrics.flush_lsn_inexact[match.group(1)] = int(match.group(2))
|
||||
for match in re.finditer(r'^safekeeper_commit_lsn{ztli="([0-9a-f]+)"} (\S+)$',
|
||||
all_metrics_text,
|
||||
re.MULTILINE):
|
||||
metrics.commit_lsn_inexact[match.group(1)] = int(match.group(2))
|
||||
return metrics
|
||||
|
||||
|
||||
def get_test_output_dir(request: Any) -> str:
|
||||
""" Compute the working directory for an individual test. """
|
||||
@@ -1225,17 +1171,7 @@ def check_restored_datadir_content(test_output_dir: str, env: ZenithEnv, pg: Pos
|
||||
| tar -x -C {restored_dir_path}
|
||||
"""
|
||||
|
||||
# Set LD_LIBRARY_PATH in the env properly, otherwise we may use the wrong libpq.
|
||||
# PgBin sets it automatically, but here we need to pipe psql output to the tar command.
|
||||
psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}
|
||||
result = subprocess.run(cmd, env=psql_env, capture_output=True, text=True, shell=True)
|
||||
|
||||
# Print captured stdout/stderr if basebackup cmd failed.
|
||||
if result.returncode != 0:
|
||||
log.error('Basebackup shell command failed with:')
|
||||
log.error(result.stdout)
|
||||
log.error(result.stderr)
|
||||
assert result.returncode == 0
|
||||
subprocess.check_call(cmd, shell=True)
|
||||
|
||||
# list files we're going to compare
|
||||
assert pg.pgdata_dir
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
@@ -16,7 +16,7 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
# 3. Disk space used
|
||||
# 4. Peak memory usage
|
||||
#
|
||||
def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmarker):
|
||||
def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark):
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_bulk_insert", "empty"])
|
||||
@@ -47,16 +47,10 @@ def test_bulk_insert(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmark
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem",
|
||||
zenbenchmark.get_peak_mem(env.pageserver) / 1024,
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
|
||||
env.initial_tenant,
|
||||
timeline)
|
||||
zenbenchmark.record('size',
|
||||
timeline_size / (1024 * 1024),
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import timeit
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
import pytest
|
||||
|
||||
from fixtures.zenith_fixtures import ZenithEnvBuilder
|
||||
@@ -55,7 +54,4 @@ def test_bulk_tenant_create(
|
||||
|
||||
pg_tenant.stop()
|
||||
|
||||
zenbenchmark.record('tenant_creation_time',
|
||||
sum(time_slices) / len(time_slices),
|
||||
's',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record('tenant_creation_time', sum(time_slices) / len(time_slices), 's')
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
@@ -49,16 +48,10 @@ def test_gist_buffering_build(zenith_simple_env: ZenithEnv, zenbenchmark):
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 1000000")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem",
|
||||
zenbenchmark.get_peak_mem(env.pageserver) / 1024,
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record("peak_mem", zenbenchmark.get_peak_mem(env.pageserver) / 1024, 'MB')
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
|
||||
env.initial_tenant,
|
||||
timeline)
|
||||
zenbenchmark.record('size',
|
||||
timeline_size / (1024 * 1024),
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
|
||||
@@ -1,140 +0,0 @@
|
||||
from io import BytesIO
|
||||
import asyncio
|
||||
import asyncpg
|
||||
from fixtures.zenith_fixtures import ZenithEnv, Postgres
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
|
||||
async def repeat_bytes(buf, repetitions: int):
|
||||
for i in range(repetitions):
|
||||
yield buf
|
||||
|
||||
|
||||
async def copy_test_data_to_table(pg: Postgres, worker_id: int, table_name: str):
|
||||
buf = BytesIO()
|
||||
for i in range(1000):
|
||||
buf.write(
|
||||
f"{i}\tLoaded by worker {worker_id}. Long string to consume some space.\n".encode())
|
||||
buf.seek(0)
|
||||
|
||||
copy_input = repeat_bytes(buf.read(), 5000)
|
||||
|
||||
pg_conn = await pg.connect_async()
|
||||
await pg_conn.copy_to_table(table_name, source=copy_input)
|
||||
|
||||
|
||||
async def parallel_load_different_tables(pg: Postgres, n_parallel: int):
|
||||
workers = []
|
||||
for worker_id in range(n_parallel):
|
||||
worker = copy_test_data_to_table(pg, worker_id, f'copytest_{worker_id}')
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
# await all workers
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
|
||||
# Load 5 different tables in parallel with COPY TO
|
||||
def test_parallel_copy_different_tables(zenith_simple_env: ZenithEnv,
|
||||
zenbenchmark: ZenithBenchmarker,
|
||||
n_parallel=5):
|
||||
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_parallel_copy_different_tables", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_parallel_copy_different_tables')
|
||||
log.info("postgres is running on 'test_parallel_copy_different_tables' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = env.pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
for worker_id in range(n_parallel):
|
||||
cur.execute(f'CREATE TABLE copytest_{worker_id} (i int, t text)')
|
||||
|
||||
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('load'):
|
||||
asyncio.run(parallel_load_different_tables(pg, n_parallel))
|
||||
|
||||
# Flush the layers from memory to disk. This is included in the reported
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem",
|
||||
zenbenchmark.get_peak_mem(env.pageserver) / 1024,
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline)
|
||||
zenbenchmark.record('size',
|
||||
timeline_size / (1024 * 1024),
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
|
||||
|
||||
async def parallel_load_same_table(pg: Postgres, n_parallel: int):
|
||||
workers = []
|
||||
for worker_id in range(n_parallel):
|
||||
worker = copy_test_data_to_table(pg, worker_id, f'copytest')
|
||||
workers.append(asyncio.create_task(worker))
|
||||
|
||||
# await all workers
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
|
||||
# Load data into one table with COPY TO from 5 parallel connections
|
||||
def test_parallel_copy_same_table(zenith_simple_env: ZenithEnv,
|
||||
zenbenchmark: ZenithBenchmarker,
|
||||
n_parallel=5):
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_parallel_copy_same_table", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_parallel_copy_same_table')
|
||||
log.info("postgres is running on 'test_parallel_copy_same_table' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = env.pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
# Get the timeline ID of our branch. We need it for the 'do_gc' command
|
||||
conn = pg.connect()
|
||||
cur = conn.cursor()
|
||||
cur.execute("SHOW zenith.zenith_timeline")
|
||||
timeline = cur.fetchone()[0]
|
||||
|
||||
cur.execute(f'CREATE TABLE copytest (i int, t text)')
|
||||
|
||||
with zenbenchmark.record_pageserver_writes(env.pageserver, 'pageserver_writes'):
|
||||
with zenbenchmark.record_duration('load'):
|
||||
asyncio.run(parallel_load_same_table(pg, n_parallel))
|
||||
|
||||
# Flush the layers from memory to disk. This is included in the reported
|
||||
# time and I/O
|
||||
pscur.execute(f"do_gc {env.initial_tenant} {timeline} 0")
|
||||
|
||||
# Record peak memory usage
|
||||
zenbenchmark.record("peak_mem",
|
||||
zenbenchmark.get_peak_mem(env.pageserver) / 1024,
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline)
|
||||
zenbenchmark.record('size',
|
||||
timeline_size / (1024 * 1024),
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
@@ -1,7 +1,6 @@
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import PgBin, ZenithEnv
|
||||
|
||||
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
@@ -16,7 +15,7 @@ pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
# 2. Time to run 5000 pgbench transactions
|
||||
# 3. Disk space used
|
||||
#
|
||||
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin: PgBin, zenbenchmark: ZenithBenchmarker):
|
||||
def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin, zenbenchmark):
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_pgbench_perf", "empty"])
|
||||
@@ -56,7 +55,4 @@ def test_pgbench(zenith_simple_env: ZenithEnv, pg_bin: PgBin, zenbenchmark: Zeni
|
||||
|
||||
# Report disk space used by the repository
|
||||
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir, env.initial_tenant, timeline)
|
||||
zenbenchmark.record('size',
|
||||
timeline_size / (1024 * 1024),
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
import dataclasses
|
||||
import os
|
||||
import subprocess
|
||||
from typing import List
|
||||
from fixtures.benchmark_fixture import PgBenchRunResult, ZenithBenchmarker
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
import calendar
|
||||
import timeit
|
||||
import os
|
||||
|
||||
pytest_plugins = ("fixtures.benchmark_fixture", )
|
||||
|
||||
|
||||
def utc_now_timestamp() -> int:
|
||||
return calendar.timegm(datetime.utcnow().utctimetuple())
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class PgBenchRunner:
|
||||
connstr: str
|
||||
scale: int
|
||||
transactions: int
|
||||
pgbench_bin_path: str = "pgbench"
|
||||
|
||||
def invoke(self, args: List[str]) -> 'subprocess.CompletedProcess[str]':
|
||||
res = subprocess.run([self.pgbench_bin_path, *args], text=True, capture_output=True)
|
||||
|
||||
if res.returncode != 0:
|
||||
raise RuntimeError(f"pgbench failed. stdout: {res.stdout} stderr: {res.stderr}")
|
||||
return res
|
||||
|
||||
def init(self, vacuum: bool = True) -> 'subprocess.CompletedProcess[str]':
|
||||
args = []
|
||||
if not vacuum:
|
||||
args.append("--no-vacuum")
|
||||
args.extend([f"--scale={self.scale}", "--initialize", self.connstr])
|
||||
return self.invoke(args)
|
||||
|
||||
def run(self, jobs: int = 1, clients: int = 1):
|
||||
return self.invoke([
|
||||
f"--transactions={self.transactions}",
|
||||
f"--jobs={jobs}",
|
||||
f"--client={clients}",
|
||||
"--progress=2", # print progress every two seconds
|
||||
self.connstr,
|
||||
])
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def connstr():
|
||||
res = os.getenv("BENCHMARK_CONNSTR")
|
||||
if res is None:
|
||||
raise ValueError("no connstr provided, use BENCHMARK_CONNSTR environment variable")
|
||||
return res
|
||||
|
||||
|
||||
def get_transactions_matrix():
|
||||
transactions = os.getenv("TEST_PG_BENCH_TRANSACTIONS_MATRIX")
|
||||
if transactions is None:
|
||||
return [10**4, 10**5]
|
||||
return list(map(int, transactions.split(",")))
|
||||
|
||||
|
||||
def get_scales_matrix():
|
||||
scales = os.getenv("TEST_PG_BENCH_SCALES_MATRIX")
|
||||
if scales is None:
|
||||
return [10, 20]
|
||||
return list(map(int, scales.split(",")))
|
||||
|
||||
|
||||
@pytest.mark.parametrize("scale", get_scales_matrix())
|
||||
@pytest.mark.parametrize("transactions", get_transactions_matrix())
|
||||
@pytest.mark.remote_cluster
|
||||
def test_pg_bench_remote_cluster(zenbenchmark: ZenithBenchmarker,
|
||||
connstr: str,
|
||||
scale: int,
|
||||
transactions: int):
|
||||
"""
|
||||
The best way is to run same pack of tests both, for local zenith
|
||||
and against staging, but currently local tests heavily depend on
|
||||
things available only locally e.g. zenith binaries, pageserver api, etc.
|
||||
Also separate test allows to run pgbench workload against vanilla postgres
|
||||
or other systems that support postgres protocol.
|
||||
|
||||
Also now this is more of a liveness test because it stresses pageserver internals,
|
||||
so we clearly see what goes wrong in more "real" environment.
|
||||
"""
|
||||
pg_bin = os.getenv("PG_BIN")
|
||||
if pg_bin is not None:
|
||||
pgbench_bin_path = os.path.join(pg_bin, "pgbench")
|
||||
else:
|
||||
pgbench_bin_path = "pgbench"
|
||||
|
||||
runner = PgBenchRunner(
|
||||
connstr=connstr,
|
||||
scale=scale,
|
||||
transactions=transactions,
|
||||
pgbench_bin_path=pgbench_bin_path,
|
||||
)
|
||||
# calculate timestamps and durations separately
|
||||
# timestamp is intended to be used for linking to grafana and logs
|
||||
# duration is actually a metric and uses float instead of int for timestamp
|
||||
init_start_timestamp = utc_now_timestamp()
|
||||
t0 = timeit.default_timer()
|
||||
runner.init()
|
||||
init_duration = timeit.default_timer() - t0
|
||||
init_end_timestamp = utc_now_timestamp()
|
||||
|
||||
run_start_timestamp = utc_now_timestamp()
|
||||
t0 = timeit.default_timer()
|
||||
out = runner.run() # TODO handle failures
|
||||
run_duration = timeit.default_timer() - t0
|
||||
run_end_timestamp = utc_now_timestamp()
|
||||
|
||||
res = PgBenchRunResult.parse_from_output(
|
||||
out=out,
|
||||
init_duration=init_duration,
|
||||
init_start_timestamp=init_start_timestamp,
|
||||
init_end_timestamp=init_end_timestamp,
|
||||
run_duration=run_duration,
|
||||
run_start_timestamp=run_start_timestamp,
|
||||
run_end_timestamp=run_end_timestamp,
|
||||
)
|
||||
|
||||
zenbenchmark.record_pg_bench_result(res)
|
||||
@@ -1,43 +0,0 @@
|
||||
# Test sequential scan speed
|
||||
#
|
||||
# The test table is large enough (3-4 MB) that it doesn't fit in the compute node
|
||||
# cache, so the seqscans go to the page server. But small enough that it fits
|
||||
# into memory in the page server.
|
||||
from contextlib import closing
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.benchmark_fixture import MetricReport, ZenithBenchmarker
|
||||
|
||||
pytest_plugins = ("fixtures.zenith_fixtures", "fixtures.benchmark_fixture")
|
||||
|
||||
|
||||
def test_small_seqscans(zenith_simple_env: ZenithEnv, zenbenchmark: ZenithBenchmarker):
|
||||
env = zenith_simple_env
|
||||
# Create a branch for us
|
||||
env.zenith_cli(["branch", "test_small_seqscans", "empty"])
|
||||
|
||||
pg = env.postgres.create_start('test_small_seqscans')
|
||||
log.info("postgres is running on 'test_small_seqscans' branch")
|
||||
|
||||
# Open a connection directly to the page server that we'll use to force
|
||||
# flushing the layers to disk
|
||||
psconn = env.pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute('create table t (i integer);')
|
||||
cur.execute('insert into t values (generate_series(1,100000));')
|
||||
|
||||
# Verify that the table is larger than shared_buffers
|
||||
cur.execute('''
|
||||
select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('t') as tbl_ize
|
||||
from pg_settings where name = 'shared_buffers'
|
||||
''')
|
||||
row = cur.fetchone()
|
||||
log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
|
||||
assert int(row[0]) < int(row[1])
|
||||
|
||||
with zenbenchmark.record_duration('run'):
|
||||
for i in range(1000):
|
||||
cur.execute('select count(*) from t;')
|
||||
@@ -12,7 +12,6 @@
|
||||
# Amplification problem at its finest.
|
||||
import os
|
||||
from contextlib import closing
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
from fixtures.zenith_fixtures import ZenithEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
@@ -77,7 +76,4 @@ def test_write_amplification(zenith_simple_env: ZenithEnv, zenbenchmark):
|
||||
timeline_size = zenbenchmark.get_timeline_size(env.repo_dir,
|
||||
env.initial_tenant,
|
||||
timeline)
|
||||
zenbenchmark.record('size',
|
||||
timeline_size / (1024 * 1024),
|
||||
'MB',
|
||||
report=MetricReport.LOWER_IS_BETTER)
|
||||
zenbenchmark.record('size', timeline_size / (1024 * 1024), 'MB')
|
||||
|
||||
@@ -1,8 +1,4 @@
|
||||
[pytest]
|
||||
addopts =
|
||||
-m 'not remote_cluster'
|
||||
markers =
|
||||
remote_cluster
|
||||
minversion = 6.0
|
||||
log_format = %(asctime)s.%(msecs)-3d %(levelname)s [%(filename)s:%(lineno)d] %(message)s
|
||||
log_date_format = %Y-%m-%d %H:%M:%S
|
||||
@@ -13,8 +13,6 @@ column_limit = 100
|
||||
split_all_top_level_comma_separated_values = true
|
||||
|
||||
[mypy]
|
||||
# mypy uses regex
|
||||
exclude = ^vendor/
|
||||
# some tests don't typecheck when this flag is set
|
||||
check_untyped_defs = false
|
||||
|
||||
2
vendor/postgres
vendored
2
vendor/postgres
vendored
Submodule vendor/postgres updated: be8bdba074...a902dd8a4e
@@ -28,7 +28,6 @@ anyhow = "1.0"
|
||||
crc32c = "0.6.0"
|
||||
humantime = "2.1.0"
|
||||
walkdir = "2"
|
||||
signal-hook = "0.3.10"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
hex = "0.4.3"
|
||||
const_format = "0.2.21"
|
||||
@@ -39,6 +38,3 @@ postgres_ffi = { path = "../postgres_ffi" }
|
||||
workspace_hack = { path = "../workspace_hack" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
|
||||
@@ -6,23 +6,23 @@ use clap::{App, Arg};
|
||||
use const_format::formatcp;
|
||||
use daemonize::Daemonize;
|
||||
use log::*;
|
||||
use std::env;
|
||||
use std::net::TcpListener;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::thread;
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::logging;
|
||||
|
||||
use walkeeper::defaults::{DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_PG_LISTEN_ADDR};
|
||||
use walkeeper::http;
|
||||
use walkeeper::s3_offload;
|
||||
use walkeeper::wal_service;
|
||||
use walkeeper::SafeKeeperConf;
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals;
|
||||
use zenith_utils::{logging, tcp_listener, GIT_VERSION};
|
||||
|
||||
fn main() -> Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("safekeeper");
|
||||
let arg_matches = App::new("Zenith safekeeper")
|
||||
.about("Store WAL stream to local file system and push it to WAL receivers")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::with_name("datadir")
|
||||
.short("D")
|
||||
@@ -78,7 +78,20 @@ fn main() -> Result<()> {
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let mut conf: SafeKeeperConf = Default::default();
|
||||
let mut conf = SafeKeeperConf {
|
||||
// Always set to './'. We will chdir into the directory specified on the
|
||||
// command line, so that when the server is running, all paths are relative
|
||||
// to that.
|
||||
workdir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_pg_addr: DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
ttl: None,
|
||||
recall_period: None,
|
||||
pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(),
|
||||
};
|
||||
|
||||
if let Some(dir) = arg_matches.value_of("datadir") {
|
||||
// change into the data directory.
|
||||
@@ -119,20 +132,17 @@ fn main() -> Result<()> {
|
||||
fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
let log_file = logging::init("safekeeper.log", conf.daemonize)?;
|
||||
|
||||
info!("version: {}", GIT_VERSION);
|
||||
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
let http_listener = TcpListener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
info!("Starting safekeeper on {}", conf.listen_pg_addr);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
let pg_listener = TcpListener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
// XXX: Don't spawn any threads before daemonizing!
|
||||
if conf.daemonize {
|
||||
info!("daemonizing...");
|
||||
|
||||
@@ -147,59 +157,51 @@ fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> {
|
||||
.stdout(stdout)
|
||||
.stderr(stderr);
|
||||
|
||||
// XXX: The parent process should exit abruptly right after
|
||||
// it has spawned a child to prevent coverage machinery from
|
||||
// dumping stats into a `profraw` file now owned by the child.
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
match daemonize.start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(e) => error!("Error, {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
let mut threads = vec![];
|
||||
let mut threads = Vec::new();
|
||||
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(|| {
|
||||
// TODO authentication
|
||||
let router = http::make_router(conf_);
|
||||
endpoint::serve_thread_main(router, http_listener).unwrap();
|
||||
})?,
|
||||
);
|
||||
let conf_cloned = conf.clone();
|
||||
let http_endpoint_thread = thread::Builder::new()
|
||||
.name("http_endpoint_thread".into())
|
||||
.spawn(|| {
|
||||
// TODO authentication
|
||||
let router = http::make_router(conf_cloned);
|
||||
endpoint::serve_thread_main(router, http_listener).unwrap();
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(http_endpoint_thread);
|
||||
|
||||
if conf.ttl.is_some() {
|
||||
let conf_ = conf.clone();
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("S3 offload thread".into())
|
||||
.spawn(|| {
|
||||
s3_offload::thread_main(conf_);
|
||||
})?,
|
||||
);
|
||||
let s3_conf = conf.clone();
|
||||
let s3_offload_thread = thread::Builder::new()
|
||||
.name("S3 offload thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
s3_offload::thread_main(s3_conf);
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(s3_offload_thread);
|
||||
}
|
||||
|
||||
threads.push(
|
||||
thread::Builder::new()
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
let thread_result = wal_service::thread_main(conf, pg_listener);
|
||||
if let Err(e) = thread_result {
|
||||
info!("wal_service thread terminated: {}", e);
|
||||
}
|
||||
})?,
|
||||
);
|
||||
let wal_acceptor_thread = thread::Builder::new()
|
||||
.name("WAL acceptor thread".into())
|
||||
.spawn(|| {
|
||||
// thread code
|
||||
let thread_result = wal_service::thread_main(conf, pg_listener);
|
||||
if let Err(e) = thread_result {
|
||||
info!("wal_service thread terminated: {}", e);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
threads.push(wal_acceptor_thread);
|
||||
|
||||
// NOTE: we still have to handle signals like SIGQUIT to prevent coredumps
|
||||
signals.handle(|signal| {
|
||||
// TODO: implement graceful shutdown with joining threads etc
|
||||
info!(
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
std::process::exit(111);
|
||||
})
|
||||
for t in threads {
|
||||
t.join().unwrap()
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -7,8 +7,7 @@ use std::fmt::Display;
|
||||
use std::sync::Arc;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::safekeeper::Term;
|
||||
use crate::safekeeper::TermHistory;
|
||||
use crate::safekeeper::AcceptorState;
|
||||
use crate::timeline::CreateControlFile;
|
||||
use crate::timeline::GlobalTimelines;
|
||||
use crate::SafeKeeperConf;
|
||||
@@ -30,7 +29,6 @@ fn get_conf(request: &Request<Body>) -> &SafeKeeperConf {
|
||||
.as_ref()
|
||||
}
|
||||
|
||||
/// Serialize through Display trait.
|
||||
fn display_serialize<S, F>(z: &F, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
@@ -39,14 +37,6 @@ where
|
||||
s.serialize_str(&format!("{}", z))
|
||||
}
|
||||
|
||||
/// Augment AcceptorState with epoch for convenience
|
||||
#[derive(Debug, Serialize)]
|
||||
struct AcceptorStateStatus {
|
||||
term: Term,
|
||||
epoch: Term,
|
||||
term_history: TermHistory,
|
||||
}
|
||||
|
||||
/// Info about timeline on safekeeper ready for reporting.
|
||||
#[derive(Debug, Serialize)]
|
||||
struct TimelineStatus {
|
||||
@@ -54,7 +44,7 @@ struct TimelineStatus {
|
||||
tenant_id: ZTenantId,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
timeline_id: ZTimelineId,
|
||||
acceptor_state: AcceptorStateStatus,
|
||||
acceptor_state: AcceptorState,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
commit_lsn: Lsn,
|
||||
#[serde(serialize_with = "display_serialize")]
|
||||
@@ -76,18 +66,12 @@ async fn timeline_status_handler(request: Request<Body>) -> Result<Response<Body
|
||||
)
|
||||
.map_err(ApiError::from_err)?;
|
||||
let sk_state = tli.get_info();
|
||||
let flush_lsn = tli.get_end_of_wal();
|
||||
|
||||
let acc_state = AcceptorStateStatus {
|
||||
term: sk_state.acceptor_state.term,
|
||||
epoch: sk_state.acceptor_state.get_epoch(flush_lsn),
|
||||
term_history: sk_state.acceptor_state.term_history,
|
||||
};
|
||||
let (flush_lsn, _) = tli.get_end_of_wal();
|
||||
|
||||
let status = TimelineStatus {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
acceptor_state: acc_state,
|
||||
acceptor_state: sk_state.acceptor_state,
|
||||
commit_lsn: sk_state.commit_lsn,
|
||||
truncate_lsn: sk_state.truncate_lsn,
|
||||
flush_lsn,
|
||||
|
||||
@@ -14,9 +14,9 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::safekeeper::{AcceptorProposerMessage, AppendResponse};
|
||||
use crate::safekeeper::{
|
||||
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerElected, ProposerGreeting,
|
||||
AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, ProposerGreeting,
|
||||
};
|
||||
use crate::safekeeper::{SafeKeeperState, Term, TermHistory, TermSwitchEntry};
|
||||
use crate::safekeeper::{SafeKeeperState, Term};
|
||||
use crate::send_wal::SendWalHandler;
|
||||
use crate::timeline::TimelineTools;
|
||||
use postgres_ffi::pg_constants;
|
||||
@@ -35,9 +35,6 @@ struct AppendLogicalMessage {
|
||||
// if true, commit_lsn will match flush_lsn after append
|
||||
set_commit_lsn: bool,
|
||||
|
||||
// if true, ProposerElected will be sent before append
|
||||
send_proposer_elected: bool,
|
||||
|
||||
// fields from AppendRequestHeader
|
||||
term: Term,
|
||||
epoch_start_lsn: Lsn,
|
||||
@@ -73,11 +70,6 @@ pub fn handle_json_ctrl(
|
||||
// need to init safekeeper state before AppendRequest
|
||||
prepare_safekeeper(swh)?;
|
||||
|
||||
// if send_proposer_elected is true, we need to update local history
|
||||
if append_request.send_proposer_elected {
|
||||
send_proposer_elected(swh, append_request.term, append_request.epoch_start_lsn)?;
|
||||
}
|
||||
|
||||
let inserted_wal = append_logical_message(swh, append_request)?;
|
||||
let response = AppendResult {
|
||||
state: swh.timeline.get().get_info(),
|
||||
@@ -112,29 +104,11 @@ fn prepare_safekeeper(swh: &mut SendWalHandler) -> Result<()> {
|
||||
|
||||
let response = swh.timeline.get().process_msg(&greeting_request)?;
|
||||
match response {
|
||||
Some(AcceptorProposerMessage::Greeting(_)) => Ok(()),
|
||||
AcceptorProposerMessage::Greeting(_) => Ok(()),
|
||||
_ => anyhow::bail!("not GreetingResponse"),
|
||||
}
|
||||
}
|
||||
|
||||
fn send_proposer_elected(swh: &mut SendWalHandler, term: Term, lsn: Lsn) -> Result<()> {
|
||||
// add new term to existing history
|
||||
let history = swh.timeline.get().get_info().acceptor_state.term_history;
|
||||
let history = history.up_to(lsn.checked_sub(1u64).unwrap());
|
||||
let mut history_entries = history.0;
|
||||
history_entries.push(TermSwitchEntry { term, lsn });
|
||||
let history = TermHistory(history_entries);
|
||||
|
||||
let proposer_elected_request = ProposerAcceptorMessage::Elected(ProposerElected {
|
||||
term,
|
||||
start_streaming_at: lsn,
|
||||
term_history: history,
|
||||
});
|
||||
|
||||
swh.timeline.get().process_msg(&proposer_elected_request)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct InsertedWAL {
|
||||
begin_lsn: Lsn,
|
||||
@@ -176,7 +150,7 @@ fn append_logical_message(
|
||||
let response = swh.timeline.get().process_msg(&append_request)?;
|
||||
|
||||
let append_response = match response {
|
||||
Some(AcceptorProposerMessage::AppendResponse(resp)) => resp,
|
||||
AcceptorProposerMessage::AppendResponse(resp) => resp,
|
||||
_ => anyhow::bail!("not AppendResponse"),
|
||||
};
|
||||
|
||||
|
||||
@@ -2,9 +2,6 @@
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
|
||||
use std::env;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
pub mod http;
|
||||
pub mod json_ctrl;
|
||||
pub mod receive_wal;
|
||||
@@ -45,28 +42,3 @@ pub struct SafeKeeperConf {
|
||||
pub ttl: Option<Duration>,
|
||||
pub recall_period: Option<Duration>,
|
||||
}
|
||||
|
||||
impl SafeKeeperConf {
|
||||
pub fn timeline_dir(&self, timelineid: &ZTimelineId) -> PathBuf {
|
||||
self.workdir.join(timelineid.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for SafeKeeperConf {
|
||||
fn default() -> Self {
|
||||
SafeKeeperConf {
|
||||
// Always set to './'. We will chdir into the directory specified on the
|
||||
// command line, so that when the server is running, all paths are relative
|
||||
// to that.
|
||||
workdir: PathBuf::from("./"),
|
||||
daemonize: false,
|
||||
no_sync: false,
|
||||
pageserver_addr: None,
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
ttl: None,
|
||||
recall_period: None,
|
||||
pageserver_auth_token: env::var("PAGESERVER_AUTH_TOKEN").ok(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
use postgres::{Client, Config, NoTls};
|
||||
|
||||
@@ -74,12 +73,12 @@ fn request_callback(conf: SafeKeeperConf, timelineid: ZTimelineId, tenantid: ZTe
|
||||
}
|
||||
|
||||
impl<'pg> ReceiveWalConn<'pg> {
|
||||
pub fn new(pg: &'pg mut PostgresBackend) -> ReceiveWalConn<'pg> {
|
||||
pub fn new(pg: &'pg mut PostgresBackend) -> Result<ReceiveWalConn<'pg>> {
|
||||
let peer_addr = *pg.get_peer_addr();
|
||||
ReceiveWalConn {
|
||||
Ok(ReceiveWalConn {
|
||||
pg_backend: pg,
|
||||
peer_addr,
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// Read and extract the bytes of a `CopyData` message from the postgres instance
|
||||
@@ -99,7 +98,7 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
|
||||
// Send message to the postgres
|
||||
fn write_msg(&mut self, msg: &AcceptorProposerMessage) -> Result<()> {
|
||||
let mut buf = BytesMut::with_capacity(128);
|
||||
let mut buf = Vec::new();
|
||||
msg.serialize(&mut buf)?;
|
||||
self.pg_backend.write_message(&BeMessage::CopyData(&buf))?;
|
||||
Ok(())
|
||||
@@ -134,23 +133,14 @@ impl<'pg> ReceiveWalConn<'pg> {
|
||||
// Add far as replication in postgres is initiated by receiver, we should use callme mechanism
|
||||
let conf = swh.conf.clone();
|
||||
let timelineid = swh.timeline.get().timelineid;
|
||||
let _ = thread::Builder::new()
|
||||
.name("request_callback thread".into())
|
||||
.spawn(move || {
|
||||
request_callback(conf, timelineid, tenant_id);
|
||||
})
|
||||
.unwrap();
|
||||
thread::spawn(move || {
|
||||
request_callback(conf, timelineid, tenant_id);
|
||||
});
|
||||
}
|
||||
|
||||
loop {
|
||||
let reply = swh
|
||||
.timeline
|
||||
.get()
|
||||
.process_msg(&msg)
|
||||
.with_context(|| "failed to process ProposerAcceptorMessage")?;
|
||||
if let Some(reply) = reply {
|
||||
self.write_msg(&reply)?;
|
||||
}
|
||||
let reply = swh.timeline.get().process_msg(&msg)?;
|
||||
self.write_msg(&reply)?;
|
||||
msg = self.read_msg()?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,15 +6,12 @@ use crate::timeline::{ReplicaState, Timeline, TimelineTools};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use log::*;
|
||||
use postgres_ffi::xlog_utils::{
|
||||
get_current_timestamp, TimestampTz, XLogFileName, MAX_SEND_SIZE, PG_TLI,
|
||||
};
|
||||
use postgres_ffi::xlog_utils::{get_current_timestamp, TimestampTz, XLogFileName, MAX_SEND_SIZE};
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::min;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::net::Shutdown;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::thread::sleep;
|
||||
@@ -55,9 +52,9 @@ impl HotStandbyFeedback {
|
||||
/// Standby status update
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct StandbyReply {
|
||||
pub write_lsn: Lsn, // not used
|
||||
pub flush_lsn: Lsn, // not used
|
||||
pub apply_lsn: Lsn, // pageserver's disk consistent lSN
|
||||
pub write_lsn: Lsn, // disk consistent lSN
|
||||
pub flush_lsn: Lsn, // LSN committedby quorum
|
||||
pub apply_lsn: Lsn, // not used
|
||||
pub reply_ts: TimestampTz,
|
||||
pub reply_requested: bool,
|
||||
}
|
||||
@@ -91,7 +88,7 @@ impl ReplicationConn {
|
||||
|
||||
/// Handle incoming messages from the network.
|
||||
/// This is spawned into the background by `handle_start_replication`.
|
||||
fn background_thread(mut stream_in: ReadStream, timeline: Arc<Timeline>) -> Result<()> {
|
||||
fn background_thread(mut stream_in: impl Read, timeline: Arc<Timeline>) -> Result<()> {
|
||||
let mut state = ReplicaState::new();
|
||||
let replica = timeline.add_replica(state);
|
||||
let _guard = ReplicationConnGuard {
|
||||
@@ -115,19 +112,14 @@ impl ReplicationConn {
|
||||
Some(STANDBY_STATUS_UPDATE_TAG_BYTE) => {
|
||||
let reply = StandbyReply::des(&m[1..])
|
||||
.context("failed to deserialize StandbyReply")?;
|
||||
state.disk_consistent_lsn = reply.apply_lsn;
|
||||
state.disk_consistent_lsn = reply.write_lsn;
|
||||
timeline.update_replica_state(replica, Some(state));
|
||||
}
|
||||
_ => warn!("unexpected message {:?}", msg),
|
||||
}
|
||||
}
|
||||
FeMessage::Sync => {}
|
||||
FeMessage::CopyFail => {
|
||||
// Shutdown the connection, because rust-postgres client cannot be dropped
|
||||
// when connection is alive.
|
||||
let _ = stream_in.shutdown(Shutdown::Both);
|
||||
return Err(anyhow!("Copy failed"));
|
||||
}
|
||||
FeMessage::CopyFail => return Err(anyhow!("Copy failed")),
|
||||
_ => {
|
||||
// We only handle `CopyData`, 'Sync', 'CopyFail' messages. Anything else is ignored.
|
||||
info!("unexpected message {:?}", msg);
|
||||
@@ -138,16 +130,16 @@ impl ReplicationConn {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function that parses a single LSN.
|
||||
fn parse_start(cmd: &[u8]) -> Result<Lsn> {
|
||||
/// Helper function that parses a pair of LSNs.
|
||||
fn parse_start_stop(cmd: &[u8]) -> Result<(Lsn, Lsn)> {
|
||||
let re = Regex::new(r"([[:xdigit:]]+/[[:xdigit:]]+)").unwrap();
|
||||
let caps = re.captures_iter(str::from_utf8(cmd)?);
|
||||
let mut lsns = caps.map(|cap| cap[1].parse::<Lsn>());
|
||||
let start_pos = lsns
|
||||
.next()
|
||||
.ok_or_else(|| anyhow!("Failed to parse start LSN from command"))??;
|
||||
assert!(lsns.next().is_none());
|
||||
Ok(start_pos)
|
||||
let stop_pos = lsns.next().transpose()?.unwrap_or(Lsn(0));
|
||||
Ok((start_pos, stop_pos))
|
||||
}
|
||||
|
||||
/// Helper function for opening a wal file.
|
||||
@@ -181,18 +173,13 @@ impl ReplicationConn {
|
||||
let bg_timeline = Arc::clone(swh.timeline.get());
|
||||
let bg_stream_in = self.stream_in.take().unwrap();
|
||||
|
||||
// TODO: here we got two threads, one for writing WAL and one for receiving
|
||||
// feedback. If one of them fails, we should shutdown the other one too.
|
||||
let _ = thread::Builder::new()
|
||||
.name("HotStandbyFeedback thread".into())
|
||||
.spawn(move || {
|
||||
if let Err(err) = Self::background_thread(bg_stream_in, bg_timeline) {
|
||||
error!("Replication background thread failed: {}", err);
|
||||
}
|
||||
})
|
||||
.unwrap();
|
||||
thread::spawn(move || {
|
||||
if let Err(err) = Self::background_thread(bg_stream_in, bg_timeline) {
|
||||
error!("Replication background thread failed: {}", err);
|
||||
}
|
||||
});
|
||||
|
||||
let mut start_pos = Self::parse_start(cmd)?;
|
||||
let (mut start_pos, mut stop_pos) = Self::parse_start_stop(cmd)?;
|
||||
|
||||
let mut wal_seg_size: usize;
|
||||
loop {
|
||||
@@ -204,22 +191,14 @@ impl ReplicationConn {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let wal_end = swh.timeline.get().get_end_of_wal();
|
||||
// Walproposer gets special handling: safekeeper must give proposer all
|
||||
// local WAL till the end, whether committed or not (walproposer will
|
||||
// hang otherwise). That's because walproposer runs the consensus and
|
||||
// synchronizes safekeepers on the most advanced one.
|
||||
//
|
||||
// There is a small risk of this WAL getting concurrently garbaged if
|
||||
// another compute rises which collects majority and starts fixing log
|
||||
// on this safekeeper itself. That's ok as (old) proposer will never be
|
||||
// able to commit such WAL.
|
||||
let stop_pos: Option<Lsn> = if swh.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
Some(wal_end)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
info!("Start replication from {:?} till {:?}", start_pos, stop_pos);
|
||||
let (wal_end, timeline) = swh.timeline.get().get_end_of_wal();
|
||||
if start_pos == Lsn(0) {
|
||||
start_pos = wal_end;
|
||||
}
|
||||
if stop_pos == Lsn(0) && swh.appname == Some("wal_proposer_recovery".to_string()) {
|
||||
stop_pos = wal_end;
|
||||
}
|
||||
info!("Start replication from {} till {}", start_pos, stop_pos);
|
||||
|
||||
// switch to copy
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
@@ -228,14 +207,18 @@ impl ReplicationConn {
|
||||
let mut wal_file: Option<File> = None;
|
||||
|
||||
loop {
|
||||
if let Some(stop_pos) = stop_pos {
|
||||
/* Wait until we have some data to stream */
|
||||
if stop_pos != Lsn(0) {
|
||||
/* recovery mode: stream up to the specified LSN (VCL) */
|
||||
if start_pos >= stop_pos {
|
||||
break; /* recovery finished */
|
||||
/* recovery finished */
|
||||
break;
|
||||
}
|
||||
end_pos = stop_pos;
|
||||
} else {
|
||||
/* Wait until we have some data to stream */
|
||||
if let Some(lsn) = swh.timeline.get().wait_for_lsn(start_pos) {
|
||||
/* normal mode */
|
||||
let timeline = swh.timeline.get();
|
||||
if let Some(lsn) = timeline.wait_for_lsn(start_pos) {
|
||||
end_pos = lsn
|
||||
} else {
|
||||
// timeout expired: request pageserver status
|
||||
@@ -258,9 +241,9 @@ impl ReplicationConn {
|
||||
None => {
|
||||
// Open a new file.
|
||||
let segno = start_pos.segment_number(wal_seg_size);
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, wal_seg_size);
|
||||
let timeline_id = swh.timeline.get().timelineid;
|
||||
let wal_file_path = swh.conf.timeline_dir(&timeline_id).join(wal_file_name);
|
||||
let wal_file_name = XLogFileName(timeline, segno, wal_seg_size);
|
||||
let timeline_id = swh.timeline.get().timelineid.to_string();
|
||||
let wal_file_path = swh.conf.workdir.join(timeline_id).join(wal_file_name);
|
||||
Self::open_wal_file(&wal_file_path)?
|
||||
}
|
||||
};
|
||||
@@ -290,7 +273,7 @@ impl ReplicationConn {
|
||||
|
||||
start_pos += send_size as u64;
|
||||
|
||||
debug!("sent WAL up to {}", start_pos);
|
||||
debug!("sent WAL up to {}", end_pos);
|
||||
|
||||
// Decide whether to reuse this file. If we don't set wal_file here
|
||||
// a new file will be opened next time.
|
||||
|
||||
@@ -1,19 +1,18 @@
|
||||
//! Acceptor part of proposer-acceptor consensus algorithm.
|
||||
|
||||
use anyhow::Context;
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use byteorder::LittleEndian;
|
||||
use byteorder::ReadBytesExt;
|
||||
use byteorder::WriteBytesExt;
|
||||
use bytes::Buf;
|
||||
use bytes::BufMut;
|
||||
use bytes::Bytes;
|
||||
use bytes::BytesMut;
|
||||
use log::*;
|
||||
use pageserver::waldecoder::WalStreamDecoder;
|
||||
use postgres_ffi::xlog_utils::TimeLineID;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::cmp::max;
|
||||
use std::cmp::min;
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
use std::io::Read;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
@@ -37,70 +36,6 @@ const UNKNOWN_SERVER_VERSION: u32 = 0;
|
||||
/// Consensus logical timestamp.
|
||||
pub type Term = u64;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
||||
pub struct TermSwitchEntry {
|
||||
pub term: Term,
|
||||
pub lsn: Lsn,
|
||||
}
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct TermHistory(pub Vec<TermSwitchEntry>);
|
||||
|
||||
impl TermHistory {
|
||||
pub fn empty() -> TermHistory {
|
||||
TermHistory(Vec::new())
|
||||
}
|
||||
|
||||
// Parse TermHistory as n_entries followed by TermSwitchEntry pairs
|
||||
pub fn from_bytes(mut bytes: Bytes) -> Result<TermHistory> {
|
||||
if bytes.remaining() < 4 {
|
||||
bail!("TermHistory misses len");
|
||||
}
|
||||
let n_entries = bytes.get_u32_le();
|
||||
let mut res = Vec::with_capacity(n_entries as usize);
|
||||
for _ in 0..n_entries {
|
||||
if bytes.remaining() < 16 {
|
||||
bail!("TermHistory is incomplete");
|
||||
}
|
||||
res.push(TermSwitchEntry {
|
||||
term: bytes.get_u64_le(),
|
||||
lsn: bytes.get_u64_le().into(),
|
||||
})
|
||||
}
|
||||
Ok(TermHistory(res))
|
||||
}
|
||||
|
||||
/// Return copy of self with switches happening strictly after up_to
|
||||
/// truncated.
|
||||
pub fn up_to(&self, up_to: Lsn) -> TermHistory {
|
||||
let mut res = Vec::with_capacity(self.0.len());
|
||||
for e in &self.0 {
|
||||
if e.lsn > up_to {
|
||||
break;
|
||||
}
|
||||
res.push(*e);
|
||||
}
|
||||
TermHistory(res)
|
||||
}
|
||||
}
|
||||
|
||||
/// Display only latest entries for Debug.
|
||||
impl fmt::Debug for TermHistory {
|
||||
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
|
||||
let n_printed = 20;
|
||||
write!(
|
||||
fmt,
|
||||
"{}{:?}",
|
||||
if self.0.len() > n_printed { "... " } else { "" },
|
||||
self.0
|
||||
.iter()
|
||||
.rev()
|
||||
.take(n_printed)
|
||||
.map(|&e| (e.term, e.lsn)) // omit TermSwitchEntry
|
||||
.collect::<Vec<_>>()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Unique id of proposer. Not needed for correctness, used for monitoring.
|
||||
type PgUuid = [u8; 16];
|
||||
|
||||
@@ -109,21 +44,8 @@ type PgUuid = [u8; 16];
|
||||
pub struct AcceptorState {
|
||||
/// acceptor's last term it voted for (advanced in 1 phase)
|
||||
pub term: Term,
|
||||
/// History of term switches for safekeeper's WAL.
|
||||
/// Actually it often goes *beyond* WAL contents as we adopt term history
|
||||
/// from the proposer before recovery.
|
||||
pub term_history: TermHistory,
|
||||
}
|
||||
|
||||
impl AcceptorState {
|
||||
/// acceptor's epoch is the term of the highest entry in the log
|
||||
pub fn get_epoch(&self, flush_lsn: Lsn) -> Term {
|
||||
let th = self.term_history.up_to(flush_lsn);
|
||||
match th.0.last() {
|
||||
Some(e) => e.term,
|
||||
None => 0,
|
||||
}
|
||||
}
|
||||
/// acceptor's epoch (advanced, i.e. bumped to 'term' when VCL is reached).
|
||||
pub epoch: Term,
|
||||
}
|
||||
|
||||
/// Information about Postgres. Safekeeper gets it once and then verifies
|
||||
@@ -136,6 +58,7 @@ pub struct ServerInfo {
|
||||
pub tenant_id: ZTenantId,
|
||||
/// Zenith timelineid
|
||||
pub ztli: ZTimelineId,
|
||||
pub tli: TimeLineID,
|
||||
pub wal_seg_size: u32,
|
||||
}
|
||||
|
||||
@@ -168,15 +91,13 @@ impl SafeKeeperState {
|
||||
SafeKeeperState {
|
||||
magic: SK_MAGIC,
|
||||
format_version: SK_FORMAT_VERSION,
|
||||
acceptor_state: AcceptorState {
|
||||
term: 0,
|
||||
term_history: TermHistory::empty(),
|
||||
},
|
||||
acceptor_state: AcceptorState { term: 0, epoch: 0 },
|
||||
server: ServerInfo {
|
||||
pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */
|
||||
system_id: 0, /* Postgres system identifier */
|
||||
tenant_id: ZTenantId::from([0u8; 16]),
|
||||
ztli: ZTimelineId::from([0u8; 16]),
|
||||
tli: 0,
|
||||
wal_seg_size: 0,
|
||||
},
|
||||
proposer_uuid: [0; 16],
|
||||
@@ -227,28 +148,16 @@ pub struct VoteRequest {
|
||||
/// Vote itself, sent from safekeeper to proposer
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct VoteResponse {
|
||||
term: Term, // safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
term: Term, // not really needed, just a sanity check
|
||||
vote_given: u64, // fixme u64 due to padding
|
||||
// Safekeeper flush_lsn (end of WAL) + history of term switches allow
|
||||
// proposer to choose the most advanced one.
|
||||
/// Safekeeper's log position, to let proposer choose the most advanced one
|
||||
epoch: Term,
|
||||
flush_lsn: Lsn,
|
||||
truncate_lsn: Lsn,
|
||||
term_history: TermHistory,
|
||||
}
|
||||
|
||||
/*
|
||||
* Proposer -> Acceptor message announcing proposer is elected and communicating
|
||||
* term history to it.
|
||||
*/
|
||||
#[derive(Debug)]
|
||||
pub struct ProposerElected {
|
||||
pub term: Term,
|
||||
pub start_streaming_at: Lsn,
|
||||
pub term_history: TermHistory,
|
||||
}
|
||||
|
||||
/// Request with WAL message sent from proposer to safekeeper. Along the way it
|
||||
/// communicates commit_lsn.
|
||||
/// announces 1) successful election (with epoch_start_lsn); 2) commit_lsn.
|
||||
#[derive(Debug)]
|
||||
pub struct AppendRequest {
|
||||
pub h: AppendRequestHeader,
|
||||
@@ -256,7 +165,6 @@ pub struct AppendRequest {
|
||||
}
|
||||
#[derive(Debug, Clone, Deserialize)]
|
||||
pub struct AppendRequestHeader {
|
||||
// safekeeper's current term; if it is higher than proposer's, the compute is out of date.
|
||||
pub term: Term,
|
||||
// LSN since the proposer appends WAL; determines epoch switch point.
|
||||
pub epoch_start_lsn: Lsn,
|
||||
@@ -278,6 +186,7 @@ pub struct AppendResponse {
|
||||
// Current term of the safekeeper; if it is higher than proposer's, the
|
||||
// compute is out of date.
|
||||
pub term: Term,
|
||||
pub epoch: Term,
|
||||
// NOTE: this is physical end of wal on safekeeper; currently it doesn't
|
||||
// make much sense without taking epoch into account, as history can be
|
||||
// diverged.
|
||||
@@ -290,32 +199,19 @@ pub struct AppendResponse {
|
||||
pub hs_feedback: HotStandbyFeedback,
|
||||
}
|
||||
|
||||
impl AppendResponse {
|
||||
fn term_only(term: Term) -> AppendResponse {
|
||||
AppendResponse {
|
||||
term,
|
||||
flush_lsn: Lsn(0),
|
||||
commit_lsn: Lsn(0),
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Proposer -> Acceptor messages
|
||||
#[derive(Debug)]
|
||||
pub enum ProposerAcceptorMessage {
|
||||
Greeting(ProposerGreeting),
|
||||
VoteRequest(VoteRequest),
|
||||
Elected(ProposerElected),
|
||||
AppendRequest(AppendRequest),
|
||||
}
|
||||
|
||||
impl ProposerAcceptorMessage {
|
||||
/// Parse proposer message.
|
||||
pub fn parse(msg_bytes: Bytes) -> Result<ProposerAcceptorMessage> {
|
||||
pub fn parse(msg: Bytes) -> Result<ProposerAcceptorMessage> {
|
||||
// xxx using Reader is inefficient but easy to work with bincode
|
||||
let mut stream = msg_bytes.reader();
|
||||
let mut stream = msg.reader();
|
||||
// u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is
|
||||
let tag = stream.read_u64::<LittleEndian>()? as u8 as char;
|
||||
match tag {
|
||||
@@ -327,21 +223,6 @@ impl ProposerAcceptorMessage {
|
||||
let msg = VoteRequest::des_from(&mut stream)?;
|
||||
Ok(ProposerAcceptorMessage::VoteRequest(msg))
|
||||
}
|
||||
'e' => {
|
||||
let mut msg_bytes = stream.into_inner();
|
||||
if msg_bytes.remaining() < 16 {
|
||||
bail!("ProposerElected message is not complete");
|
||||
}
|
||||
let term = msg_bytes.get_u64_le();
|
||||
let start_streaming_at = msg_bytes.get_u64_le().into();
|
||||
let term_history = TermHistory::from_bytes(msg_bytes)?;
|
||||
let msg = ProposerElected {
|
||||
term,
|
||||
start_streaming_at,
|
||||
term_history,
|
||||
};
|
||||
Ok(ProposerAcceptorMessage::Elected(msg))
|
||||
}
|
||||
'a' => {
|
||||
// read header followed by wal data
|
||||
let hdr = AppendRequestHeader::des_from(&mut stream)?;
|
||||
@@ -379,33 +260,19 @@ pub enum AcceptorProposerMessage {
|
||||
|
||||
impl AcceptorProposerMessage {
|
||||
/// Serialize acceptor -> proposer message.
|
||||
pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
|
||||
pub fn serialize(&self, stream: &mut impl io::Write) -> Result<()> {
|
||||
match self {
|
||||
AcceptorProposerMessage::Greeting(msg) => {
|
||||
buf.put_u64_le('g' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
stream.write_u64::<LittleEndian>('g' as u64)?;
|
||||
msg.ser_into(stream)?;
|
||||
}
|
||||
AcceptorProposerMessage::VoteResponse(msg) => {
|
||||
buf.put_u64_le('v' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.vote_given);
|
||||
buf.put_u64_le(msg.flush_lsn.into());
|
||||
buf.put_u64_le(msg.truncate_lsn.into());
|
||||
buf.put_u32_le(msg.term_history.0.len() as u32);
|
||||
for e in &msg.term_history.0 {
|
||||
buf.put_u64_le(e.term);
|
||||
buf.put_u64_le(e.lsn.into());
|
||||
}
|
||||
stream.write_u64::<LittleEndian>('v' as u64)?;
|
||||
msg.ser_into(stream)?;
|
||||
}
|
||||
AcceptorProposerMessage::AppendResponse(msg) => {
|
||||
buf.put_u64_le('a' as u64);
|
||||
buf.put_u64_le(msg.term);
|
||||
buf.put_u64_le(msg.flush_lsn.into());
|
||||
buf.put_u64_le(msg.commit_lsn.into());
|
||||
buf.put_u64_le(msg.disk_consistent_lsn.into());
|
||||
buf.put_i64_le(msg.hs_feedback.ts);
|
||||
buf.put_u64_le(msg.hs_feedback.xmin);
|
||||
buf.put_u64_le(msg.hs_feedback.catalog_xmin);
|
||||
stream.write_u64::<LittleEndian>('a' as u64)?;
|
||||
msg.ser_into(stream)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -418,8 +285,6 @@ pub trait Storage {
|
||||
fn persist(&mut self, s: &SafeKeeperState, sync: bool) -> Result<()>;
|
||||
/// Write piece of wal in buf to disk and sync it.
|
||||
fn write_wal(&mut self, server: &ServerInfo, startpos: Lsn, buf: &[u8]) -> Result<()>;
|
||||
// Truncate WAL at specified LSN
|
||||
fn truncate_wal(&mut self, s: &ServerInfo, endpos: Lsn) -> Result<()>;
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
@@ -487,13 +352,15 @@ pub struct SafeKeeper<ST: Storage> {
|
||||
/// Locally flushed part of WAL with full records (end_lsn of last record).
|
||||
/// Established by reading wal.
|
||||
pub flush_lsn: Lsn,
|
||||
pub tli: u32,
|
||||
// Cached metrics so we don't have to recompute labels on each update.
|
||||
metrics: SafeKeeperMetrics,
|
||||
/// not-yet-flushed pairs of same named fields in s.*
|
||||
pub commit_lsn: Lsn,
|
||||
pub truncate_lsn: Lsn,
|
||||
pub storage: ST,
|
||||
pub s: SafeKeeperState, // persistent part
|
||||
pub s: SafeKeeperState, // persistent part
|
||||
pub elected_proposer_term: Term, // for monitoring/debugging
|
||||
decoder: WalStreamDecoder,
|
||||
}
|
||||
|
||||
@@ -502,48 +369,36 @@ where
|
||||
ST: Storage,
|
||||
{
|
||||
// constructor
|
||||
pub fn new(flush_lsn: Lsn, storage: ST, state: SafeKeeperState) -> SafeKeeper<ST> {
|
||||
pub fn new(flush_lsn: Lsn, tli: u32, storage: ST, state: SafeKeeperState) -> SafeKeeper<ST> {
|
||||
SafeKeeper {
|
||||
flush_lsn,
|
||||
tli,
|
||||
metrics: SafeKeeperMetrics::new_noname(),
|
||||
commit_lsn: state.commit_lsn,
|
||||
truncate_lsn: state.truncate_lsn,
|
||||
storage,
|
||||
s: state,
|
||||
elected_proposer_term: 0,
|
||||
decoder: WalStreamDecoder::new(Lsn(0)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Get history of term switches for the available WAL
|
||||
fn get_term_history(&self) -> TermHistory {
|
||||
self.s.acceptor_state.term_history.up_to(self.flush_lsn)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
fn get_epoch(&self) -> Term {
|
||||
self.s.acceptor_state.get_epoch(self.flush_lsn)
|
||||
}
|
||||
|
||||
/// Process message from proposer and possibly form reply. Concurrent
|
||||
/// callers must exclude each other.
|
||||
pub fn process_msg(
|
||||
&mut self,
|
||||
msg: &ProposerAcceptorMessage,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
) -> Result<AcceptorProposerMessage> {
|
||||
match msg {
|
||||
ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg),
|
||||
ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg),
|
||||
ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg),
|
||||
ProposerAcceptorMessage::AppendRequest(msg) => self.handle_append_request(msg),
|
||||
}
|
||||
}
|
||||
|
||||
/// Handle initial message from proposer: check its sanity and send my
|
||||
/// current term.
|
||||
fn handle_greeting(
|
||||
&mut self,
|
||||
msg: &ProposerGreeting,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
fn handle_greeting(&mut self, msg: &ProposerGreeting) -> Result<AcceptorProposerMessage> {
|
||||
/* Check protocol compatibility */
|
||||
if msg.protocol_version != SK_PROTOCOL_VERSION {
|
||||
bail!(
|
||||
@@ -566,10 +421,9 @@ where
|
||||
self.s.server.system_id = msg.system_id;
|
||||
self.s.server.tenant_id = msg.tenant_id;
|
||||
self.s.server.ztli = msg.ztli;
|
||||
self.s.server.tli = msg.tli;
|
||||
self.s.server.wal_seg_size = msg.wal_seg_size;
|
||||
self.storage
|
||||
.persist(&self.s, true)
|
||||
.with_context(|| "failed to persist shared state")?;
|
||||
self.storage.persist(&self.s, true)?;
|
||||
|
||||
self.metrics = SafeKeeperMetrics::new(self.s.server.ztli);
|
||||
|
||||
@@ -577,106 +431,64 @@ where
|
||||
"processed greeting from proposer {:?}, sending term {:?}",
|
||||
msg.proposer_id, self.s.acceptor_state.term
|
||||
);
|
||||
Ok(Some(AcceptorProposerMessage::Greeting(AcceptorGreeting {
|
||||
Ok(AcceptorProposerMessage::Greeting(AcceptorGreeting {
|
||||
term: self.s.acceptor_state.term,
|
||||
})))
|
||||
}))
|
||||
}
|
||||
|
||||
/// Give vote for the given term, if we haven't done that previously.
|
||||
fn handle_vote_request(
|
||||
&mut self,
|
||||
msg: &VoteRequest,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
fn handle_vote_request(&mut self, msg: &VoteRequest) -> Result<AcceptorProposerMessage> {
|
||||
// initialize with refusal
|
||||
let mut resp = VoteResponse {
|
||||
term: self.s.acceptor_state.term,
|
||||
term: msg.term,
|
||||
vote_given: false as u64,
|
||||
flush_lsn: self.flush_lsn,
|
||||
truncate_lsn: self.s.truncate_lsn,
|
||||
term_history: self.get_term_history(),
|
||||
epoch: 0,
|
||||
flush_lsn: Lsn(0),
|
||||
truncate_lsn: Lsn(0),
|
||||
};
|
||||
if self.s.acceptor_state.term < msg.term {
|
||||
self.s.acceptor_state.term = msg.term;
|
||||
// persist vote before sending it out
|
||||
self.storage.persist(&self.s, true)?;
|
||||
resp.term = self.s.acceptor_state.term;
|
||||
resp.vote_given = true as u64;
|
||||
resp.epoch = self.s.acceptor_state.epoch;
|
||||
resp.flush_lsn = self.flush_lsn;
|
||||
resp.truncate_lsn = self.s.truncate_lsn;
|
||||
}
|
||||
info!("processed VoteRequest for term {}: {:?}", msg.term, &resp);
|
||||
Ok(Some(AcceptorProposerMessage::VoteResponse(resp)))
|
||||
}
|
||||
|
||||
/// Bump our term if received a note from elected proposer with higher one
|
||||
fn bump_if_higher(&mut self, term: Term) -> Result<()> {
|
||||
if self.s.acceptor_state.term < term {
|
||||
self.s.acceptor_state.term = term;
|
||||
self.storage.persist(&self.s, true)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Form AppendResponse from current state.
|
||||
fn append_response(&self) -> AppendResponse {
|
||||
AppendResponse {
|
||||
term: self.s.acceptor_state.term,
|
||||
flush_lsn: self.flush_lsn,
|
||||
commit_lsn: self.s.commit_lsn,
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
// will be filled by the upper code to avoid bothering safekeeper
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
fn handle_elected(&mut self, msg: &ProposerElected) -> Result<Option<AcceptorProposerMessage>> {
|
||||
info!("received ProposerElected {:?}", msg);
|
||||
self.bump_if_higher(msg.term)?;
|
||||
// If our term is higher, ignore the message (next feedback will inform the compute)
|
||||
if self.s.acceptor_state.term > msg.term {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// TODO: cross check divergence point
|
||||
|
||||
// streaming must not create a hole
|
||||
assert!(self.flush_lsn == Lsn(0) || self.flush_lsn >= msg.start_streaming_at);
|
||||
|
||||
// truncate obsolete part of WAL
|
||||
if self.flush_lsn != Lsn(0) {
|
||||
self.storage
|
||||
.truncate_wal(&self.s.server, msg.start_streaming_at)?;
|
||||
}
|
||||
// update our end of WAL pointer
|
||||
self.flush_lsn = msg.start_streaming_at;
|
||||
// and now adopt term history from proposer
|
||||
self.s.acceptor_state.term_history = msg.term_history.clone();
|
||||
self.storage.persist(&self.s, true)?;
|
||||
|
||||
info!("start receiving WAL since {:?}", msg.start_streaming_at);
|
||||
|
||||
Ok(None)
|
||||
Ok(AcceptorProposerMessage::VoteResponse(resp))
|
||||
}
|
||||
|
||||
/// Handle request to append WAL.
|
||||
#[allow(clippy::comparison_chain)]
|
||||
fn handle_append_request(
|
||||
&mut self,
|
||||
msg: &AppendRequest,
|
||||
) -> Result<Option<AcceptorProposerMessage>> {
|
||||
fn handle_append_request(&mut self, msg: &AppendRequest) -> Result<AcceptorProposerMessage> {
|
||||
// log first AppendRequest from this proposer
|
||||
if self.elected_proposer_term < msg.h.term {
|
||||
info!(
|
||||
"start accepting WAL from timeline {}, tenant {}, term {}, epochStartLsn {:?}",
|
||||
self.s.server.ztli, self.s.server.tenant_id, msg.h.term, msg.h.epoch_start_lsn,
|
||||
);
|
||||
self.elected_proposer_term = msg.h.term;
|
||||
}
|
||||
|
||||
// If our term is lower than elected proposer one, bump it.
|
||||
if self.s.acceptor_state.term < msg.h.term {
|
||||
bail!("got AppendRequest before ProposerElected");
|
||||
self.s.acceptor_state.term = msg.h.term;
|
||||
self.storage.persist(&self.s, true)?;
|
||||
}
|
||||
|
||||
// If our term is higher, immediately refuse the message.
|
||||
if self.s.acceptor_state.term > msg.h.term {
|
||||
let resp = AppendResponse::term_only(self.s.acceptor_state.term);
|
||||
return Ok(Some(AcceptorProposerMessage::AppendResponse(resp)));
|
||||
// OTOH, if it is higher, immediately refuse the message.
|
||||
else if self.s.acceptor_state.term > msg.h.term {
|
||||
let resp = AppendResponse {
|
||||
term: self.s.acceptor_state.term,
|
||||
epoch: self.s.acceptor_state.epoch,
|
||||
commit_lsn: Lsn(0),
|
||||
flush_lsn: Lsn(0),
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
};
|
||||
return Ok(AcceptorProposerMessage::AppendResponse(resp));
|
||||
}
|
||||
|
||||
// After ProposerElected, which performs truncation, we should get only
|
||||
// indeed append requests (but flush_lsn is advanced only on record
|
||||
// boundary, so might be less).
|
||||
assert!(self.flush_lsn <= msg.h.begin_lsn);
|
||||
|
||||
self.s.proposer_uuid = msg.h.proposer_uuid;
|
||||
let mut sync_control_file = false;
|
||||
|
||||
@@ -720,21 +532,48 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Epoch switch happen when written WAL record cross the boundary.
|
||||
* The boundary is maximum of last WAL position at this node (FlushLSN) and global
|
||||
* maximum (vcl) determined by WAL proposer during handshake.
|
||||
* Switching epoch means that node completes recovery and start writing in the WAL new data.
|
||||
* XXX: this is wrong, we must actively truncate not matching part of log.
|
||||
*
|
||||
* The non-strict inequality is important for us, as proposer in --sync mode doesn't
|
||||
* generate new records, but to advance commit_lsn epoch switch must happen on majority.
|
||||
* We can regard this as commit of empty entry in new epoch, this should be safe.
|
||||
*/
|
||||
if self.s.acceptor_state.epoch < msg.h.term
|
||||
&& msg.h.end_lsn >= max(self.flush_lsn, msg.h.epoch_start_lsn)
|
||||
{
|
||||
info!(
|
||||
"switched to new epoch {} on receival of request end_lsn={:?}, len={:?}",
|
||||
msg.h.term,
|
||||
msg.h.end_lsn,
|
||||
msg.wal_data.len(),
|
||||
);
|
||||
self.s.acceptor_state.epoch = msg.h.term; /* bump epoch */
|
||||
sync_control_file = true;
|
||||
}
|
||||
if last_rec_lsn > self.flush_lsn {
|
||||
self.flush_lsn = last_rec_lsn;
|
||||
self.metrics.flush_lsn.set(u64::from(self.flush_lsn) as f64);
|
||||
}
|
||||
|
||||
// Advance commit_lsn taking into account what we have locally.
|
||||
// Advance commit_lsn taking into account what we have locally. xxx this
|
||||
// is wrapped into epoch check because we overwrite wal instead of
|
||||
// truncating it, so without it commit_lsn might include wrong part.
|
||||
// Anyway, nobody is much interested in our commit_lsn while epoch
|
||||
// switch hasn't happened, right?
|
||||
//
|
||||
// commit_lsn can be 0, being unknown to new walproposer while he hasn't
|
||||
// collected majority of its epoch acks yet, ignore it in this case.
|
||||
if msg.h.commit_lsn != Lsn(0) {
|
||||
if self.s.acceptor_state.epoch == msg.h.term && msg.h.commit_lsn != Lsn(0) {
|
||||
let commit_lsn = min(msg.h.commit_lsn, self.flush_lsn);
|
||||
// If new commit_lsn reached epoch switch, force sync of control
|
||||
// file: walproposer in sync mode is very interested when this
|
||||
// happens. Note: this is for sync-safekeepers mode only, as
|
||||
// otherwise commit_lsn might jump over epoch_start_lsn.
|
||||
sync_control_file |= commit_lsn == msg.h.epoch_start_lsn;
|
||||
// If new commit_lsn reached epoch switch, force sync of control file:
|
||||
// walproposer in sync mode is very interested when this happens.
|
||||
sync_control_file |=
|
||||
commit_lsn >= msg.h.epoch_start_lsn && self.s.commit_lsn < msg.h.epoch_start_lsn;
|
||||
self.commit_lsn = commit_lsn;
|
||||
self.metrics
|
||||
.commit_lsn
|
||||
@@ -755,7 +594,15 @@ where
|
||||
}
|
||||
self.storage.persist(&self.s, sync_control_file)?;
|
||||
|
||||
let resp = self.append_response();
|
||||
let resp = AppendResponse {
|
||||
term: self.s.acceptor_state.term,
|
||||
epoch: self.s.acceptor_state.epoch,
|
||||
flush_lsn: self.flush_lsn,
|
||||
commit_lsn: self.s.commit_lsn,
|
||||
disk_consistent_lsn: Lsn(0),
|
||||
// will be filled by caller code to avoid bothering safekeeper
|
||||
hs_feedback: HotStandbyFeedback::empty(),
|
||||
};
|
||||
info!(
|
||||
"processed AppendRequest of len {}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, resp {:?}",
|
||||
msg.wal_data.len(),
|
||||
@@ -764,7 +611,7 @@ where
|
||||
msg.h.truncate_lsn,
|
||||
&resp,
|
||||
);
|
||||
Ok(Some(AcceptorProposerMessage::AppendResponse(resp)))
|
||||
Ok(AcceptorProposerMessage::AppendResponse(resp))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -786,10 +633,6 @@ mod tests {
|
||||
fn write_wal(&mut self, _server: &ServerInfo, _startpos: Lsn, _buf: &[u8]) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn truncate_wal(&mut self, _server: &ServerInfo, _end_pos: Lsn) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -797,13 +640,13 @@ mod tests {
|
||||
let storage = InMemoryStorage {
|
||||
persisted_state: SafeKeeperState::new(),
|
||||
};
|
||||
let mut sk = SafeKeeper::new(Lsn(0), storage, SafeKeeperState::new());
|
||||
let mut sk = SafeKeeper::new(Lsn(0), 0, storage, SafeKeeperState::new());
|
||||
|
||||
// check voting for 1 is ok
|
||||
let vote_request = ProposerAcceptorMessage::VoteRequest(VoteRequest { term: 1 });
|
||||
let mut vote_resp = sk.process_msg(&vote_request);
|
||||
match vote_resp.unwrap() {
|
||||
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given != 0),
|
||||
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given != 0),
|
||||
r => panic!("unexpected response: {:?}", r),
|
||||
}
|
||||
|
||||
@@ -812,12 +655,12 @@ mod tests {
|
||||
let storage = InMemoryStorage {
|
||||
persisted_state: state.clone(),
|
||||
};
|
||||
sk = SafeKeeper::new(Lsn(0), storage, state);
|
||||
sk = SafeKeeper::new(Lsn(0), 0, storage, state);
|
||||
|
||||
// and ensure voting second time for 1 is not ok
|
||||
vote_resp = sk.process_msg(&vote_request);
|
||||
match vote_resp.unwrap() {
|
||||
Some(AcceptorProposerMessage::VoteResponse(resp)) => assert!(resp.vote_given == 0),
|
||||
AcceptorProposerMessage::VoteResponse(resp) => assert!(resp.vote_given == 0),
|
||||
r => panic!("unexpected response: {:?}", r),
|
||||
}
|
||||
}
|
||||
@@ -827,7 +670,7 @@ mod tests {
|
||||
let storage = InMemoryStorage {
|
||||
persisted_state: SafeKeeperState::new(),
|
||||
};
|
||||
let mut sk = SafeKeeper::new(Lsn(0), storage, SafeKeeperState::new());
|
||||
let mut sk = SafeKeeper::new(Lsn(0), 0, storage, SafeKeeperState::new());
|
||||
|
||||
let mut ar_hdr = AppendRequestHeader {
|
||||
term: 1,
|
||||
@@ -843,21 +686,10 @@ mod tests {
|
||||
wal_data: Bytes::from_static(b"b"),
|
||||
};
|
||||
|
||||
let pem = ProposerElected {
|
||||
term: 1,
|
||||
start_streaming_at: Lsn(1),
|
||||
term_history: TermHistory(vec![TermSwitchEntry {
|
||||
term: 1,
|
||||
lsn: Lsn(3),
|
||||
}]),
|
||||
};
|
||||
sk.process_msg(&ProposerAcceptorMessage::Elected(pem))
|
||||
.unwrap();
|
||||
|
||||
// check that AppendRequest before epochStartLsn doesn't switch epoch
|
||||
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
|
||||
assert!(resp.is_ok());
|
||||
assert_eq!(sk.get_epoch(), 0);
|
||||
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 0);
|
||||
|
||||
// but record at epochStartLsn does the switch
|
||||
ar_hdr.begin_lsn = Lsn(2);
|
||||
@@ -868,7 +700,6 @@ mod tests {
|
||||
};
|
||||
let resp = sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request));
|
||||
assert!(resp.is_ok());
|
||||
sk.flush_lsn = Lsn(3); // imitate the complete record at 3 %)
|
||||
assert_eq!(sk.get_epoch(), 1);
|
||||
assert_eq!(sk.storage.persisted_state.acceptor_state.epoch, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,9 +7,8 @@ use crate::receive_wal::ReceiveWalConn;
|
||||
use crate::replication::ReplicationConn;
|
||||
use crate::timeline::{Timeline, TimelineTools};
|
||||
use crate::SafeKeeperConf;
|
||||
use anyhow::{anyhow, bail, Context, Result};
|
||||
use anyhow::{anyhow, bail, Result};
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::xlog_utils::PG_TLI;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use zenith_utils::postgres_backend;
|
||||
@@ -75,9 +74,7 @@ impl postgres_backend::Handler for SendWalHandler {
|
||||
} else if query_string.starts_with(b"START_REPLICATION") {
|
||||
ReplicationConn::new(pgb).run(self, pgb, &query_string)?;
|
||||
} else if query_string.starts_with(b"START_WAL_PUSH") {
|
||||
ReceiveWalConn::new(pgb)
|
||||
.run(self)
|
||||
.with_context(|| "failed to run ReceiveWalConn")?;
|
||||
ReceiveWalConn::new(pgb)?.run(self)?;
|
||||
} else if query_string.starts_with(b"JSON_CTRL") {
|
||||
handle_json_ctrl(self, pgb, &query_string)?;
|
||||
} else {
|
||||
@@ -102,11 +99,11 @@ impl SendWalHandler {
|
||||
/// Handle IDENTIFY_SYSTEM replication command
|
||||
///
|
||||
fn handle_identify_system(&mut self, pgb: &mut PostgresBackend) -> Result<()> {
|
||||
let start_pos = self.timeline.get().get_end_of_wal();
|
||||
let (start_pos, timeline) = self.timeline.get().get_end_of_wal();
|
||||
let lsn = start_pos.to_string();
|
||||
let tli = timeline.to_string();
|
||||
let sysid = self.timeline.get().get_info().server.system_id.to_string();
|
||||
let lsn_bytes = lsn.as_bytes();
|
||||
let tli = PG_TLI.to_string();
|
||||
let tli_bytes = tli.as_bytes();
|
||||
let sysid_bytes = sysid.as_bytes();
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user