Compare commits

..

32 Commits

Author SHA1 Message Date
Bojan Serafimov  9ee5f17c41  Fix test  2022-06-29 17:40:48 -04:00
Bojan Serafimov  170c882c84  Check that tasks finish  2022-06-29 16:27:22 -04:00
Bojan Serafimov  a5bacd42f5  Merge branch 'main' into tenant-tasks-test  2022-06-29 14:52:33 -04:00
Bojan Serafimov  bfeaa4957f  WIP  2022-06-29 14:24:22 -04:00
Bojan Serafimov  633b1762f6  Fix metric name  2022-06-29 11:39:15 -04:00
Bojan Serafimov  2b4c3cb932  Handle errors on shutdown  2022-06-28 09:29:59 -04:00
Bojan Serafimov  bf76f43ea4  Wait for tasks to complete  2022-06-27 17:54:31 -04:00
Bojan Serafimov  98062865f4  Handle errors inside loop  2022-06-24 17:08:44 -04:00
Bojan Serafimov  cdc81996b4  Add metrics  2022-06-24 15:28:53 -04:00
Bojan Serafimov  1169e9ea4c  Rename threads to tasks  2022-06-24 11:08:55 -04:00
Bojan Serafimov  3a23869780  Log errors  2022-06-24 11:05:07 -04:00
Bojan Serafimov  796ee4d8af  Instrument the task, not the await  2022-06-24 10:29:42 -04:00
Bojan Serafimov  b31ce411d2  Remove unnecessary map_err  2022-06-24 09:50:14 -04:00
Bojan Serafimov  24a5bd10a0  Add cancellation  2022-06-23 23:25:48 -04:00
Bojan Serafimov  2c029d9803  Remove redundant error context  2022-06-23 22:32:15 -04:00
Bojan Serafimov  763b00ccee  Merge branch 'tenant-tasks' of github.com:neondatabase/neon into tenant-tasks  2022-06-23 14:15:14 -04:00
Bojan Serafimov  c44c8a0ea0  Add comment  2022-06-23 14:14:39 -04:00
Bojan Serafimov  692496d733  Cancel tasks  2022-06-23 13:20:53 -04:00
Bojan Serafimov  0f4552a544  Update TODO  2022-06-23 12:59:20 -04:00
Bojan Serafimov  d7d4cc8c77  Error instead of panic  2022-06-23 12:56:21 -04:00
Bojan Serafimov  9aab1d0f2b  Expand blocking scope  2022-06-22 16:04:42 -04:00
Bojan Serafimov  83dc93ab0f  Merge branch 'main' into tenant-tasks  2022-06-22 14:49:45 -04:00
Bojan Serafimov  9a9a58d52c  Fmt  2022-06-15 09:49:07 -04:00
Bojan Serafimov  865e8740a7  Add docs  2022-06-15 09:35:41 -04:00
Bojan Serafimov  1a5d1a15d0  Use tokio sleep instead  2022-06-15 09:33:56 -04:00
Bojan Serafimov  02a9883f0f  Add TenantTaskManager  2022-06-15 09:24:46 -04:00
Bojan Serafimov  ee36ca54d5  Run compaction as task  2022-06-15 09:16:57 -04:00
Bojan Serafimov  36cc6d2928  Fmt  2022-06-14 15:07:36 -04:00
Bojan Serafimov  e1a4c06918  Fix init  2022-06-14 15:05:30 -04:00
Bojan Serafimov  ec4528505e  Simplify  2022-06-14 12:38:09 -04:00
Bojan Serafimov  c79e72e835  Add runtime  2022-06-14 11:08:17 -04:00
Bojan Serafimov  a1f85715ac  WIP  2022-06-14 09:39:55 -04:00
157 changed files with 2899 additions and 6387 deletions

View File

@@ -1,13 +0,0 @@
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
#
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
#
[profile.dev.package."*"]
# Set the default for dependencies in Development mode.
opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1

View File

@@ -6,7 +6,5 @@ timeout = 30
[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg
# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
# and scp didn't work for me either
transfer_method = piped
scp_if_ssh = True
pipelining = True

View File

@@ -1,7 +1,3 @@
# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
# (use pre 8.5 option name to cope with old ssh in CI)
PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
Host tele.zenith.tech
User admin
Port 3023

View File

@@ -12,7 +12,6 @@ pageservers
safekeepers
[storage:vars]
env_name = neon-stress
console_mgmt_base_url = http://neon-stress-console.local
bucket_name = neon-storage-ireland
bucket_region = eu-west-1

View File

@@ -1,7 +1,6 @@
[pageservers]
#zenith-1-ps-1 console_region_id=1
zenith-1-ps-2 console_region_id=1
zenith-1-ps-3 console_region_id=1
[safekeepers]
zenith-1-sk-1 console_region_id=1
@@ -13,7 +12,6 @@ pageservers
safekeepers
[storage:vars]
env_name = prod-1
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2

View File

@@ -13,7 +13,6 @@ pageservers
safekeepers
[storage:vars]
env_name = us-stage
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1

View File

@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -5,10 +5,10 @@ executors:
resource_class: xlarge
docker:
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
- image: neondatabase/rust:1.58
- image: zimg/rust:1.58
neon-executor:
docker:
- image: neondatabase/rust:1.58
- image: zimg/rust:1.58
jobs:
# A job to build postgres
@@ -37,7 +37,7 @@ jobs:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
# Build postgres if the restore_cache didn't find a build.
# `make` can't figure out whether the cache is valid, since
@@ -54,7 +54,7 @@ jobs:
- save_cache:
name: Save postgres cache
key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
paths:
- tmp_install
@@ -85,7 +85,7 @@ jobs:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- restore_cache:
name: Restore rust cache
@@ -93,7 +93,7 @@ jobs:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
@@ -107,7 +107,7 @@ jobs:
export CARGO_INCREMENTAL=0
export CACHEPOT_BUCKET=zenith-rust-cachepot
export RUSTC_WRAPPER=""
export RUSTC_WRAPPER=cachepot
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
@@ -115,7 +115,7 @@ jobs:
- save_cache:
name: Save rust cache
key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
@@ -142,6 +142,11 @@ jobs:
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
test_exe_paths=$(
cargo test --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
mkdir -p /tmp/zenith/bin
mkdir -p /tmp/zenith/test_bin
mkdir -p /tmp/zenith/etc
@@ -325,6 +330,274 @@ jobs:
paths:
- "*"
# Build neondatabase/neon:latest image and push it to Docker hub
docker-image:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l)
docker build \
--pull \
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
docker push neondatabase/neon:${DOCKER_TAG}
docker push neondatabase/neon:latest
# Build neondatabase/compute-node:latest image and push it to Docker hub
docker-image-compute:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Build and push compute-tools Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
docker build \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/compute-tools:local \
--tag neondatabase/compute-tools:latest \
-f Dockerfile.compute-tools .
# Only push :latest image
docker push neondatabase/compute-tools:latest
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push compute-node Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l)
docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
--tag neondatabase/compute-node:latest vendor/postgres \
--build-arg COMPUTE_TOOLS_TAG=local
docker push neondatabase/compute-node:${DOCKER_TAG}
docker push neondatabase/compute-node:latest
# Build production neondatabase/neon:release image and push it to Docker hub
docker-image-release:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG="release-$(git log --oneline|wc -l)"
docker build \
--pull \
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
docker push neondatabase/neon:${DOCKER_TAG}
docker push neondatabase/neon:release
# Build production neondatabase/compute-node:release image and push it to Docker hub
docker-image-compute-release:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Build and push compute-tools Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
docker build \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/compute-tools:release \
--tag neondatabase/compute-tools:local \
-f Dockerfile.compute-tools .
# Only push :release image
docker push neondatabase/compute-tools:release
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push compute-node Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG="release-$(git log --oneline|wc -l)"
docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
--tag neondatabase/compute-node:release vendor/postgres \
--build-arg COMPUTE_TOOLS_TAG=local
docker push neondatabase/compute-node:${DOCKER_TAG}
docker push neondatabase/compute-node:release
deploy-staging:
docker:
- image: cimg/python:3.10
steps:
- checkout
- setup_remote_docker
- run:
name: Setup ansible
command: |
pip install --progress-bar off --user ansible boto3
- run:
name: Redeploy
command: |
cd "$(pwd)/.circleci/ansible"
./get_binaries.sh
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i staging.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-staging-proxy:
docker:
- image: cimg/base:2021.04
environment:
KUBECONFIG: .kubeconfig
steps:
- checkout
- run:
name: Store kubeconfig file
command: |
echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- run:
name: Setup helm v3
command: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- run:
name: Re-deploy proxy
command: |
DOCKER_TAG=$(git log --oneline|wc -l)
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
deploy-neon-stress:
docker:
- image: cimg/python:3.10
steps:
- checkout
- setup_remote_docker
- run:
name: Setup ansible
command: |
pip install --progress-bar off --user ansible boto3
- run:
name: Redeploy
command: |
cd "$(pwd)/.circleci/ansible"
./get_binaries.sh
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i neon-stress.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-neon-stress-proxy:
docker:
- image: cimg/base:2021.04
environment:
KUBECONFIG: .kubeconfig
steps:
- checkout
- run:
name: Store kubeconfig file
command: |
echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- run:
name: Setup helm v3
command: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- run:
name: Re-deploy proxy
command: |
DOCKER_TAG=$(git log --oneline|wc -l)
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
deploy-release:
docker:
- image: cimg/python:3.10
steps:
- checkout
- setup_remote_docker
- run:
name: Setup ansible
command: |
pip install --progress-bar off --user ansible boto3
- run:
name: Redeploy
command: |
cd "$(pwd)/.circleci/ansible"
RELEASE=true ./get_binaries.sh
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i production.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-release-proxy:
docker:
- image: cimg/base:2021.04
environment:
KUBECONFIG: .kubeconfig
steps:
- checkout
- run:
name: Store kubeconfig file
command: |
echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- run:
name: Setup helm v3
command: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- run:
name: Re-deploy proxy
command: |
DOCKER_TAG="release-$(git log --oneline|wc -l)"
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
workflows:
build_and_test:
jobs:
@@ -367,3 +640,103 @@ workflows:
save_perf_report: true
requires:
- build-neon-release
- docker-image:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- main
requires:
- pg_regress-tests-release
- other-tests-release
- docker-image-compute:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- main
requires:
- pg_regress-tests-release
- other-tests-release
- deploy-staging:
# Context gives an ability to login
context: Docker Hub
# deploy only for commits to main
filters:
branches:
only:
- main
requires:
- docker-image
- deploy-staging-proxy:
# deploy only for commits to main
filters:
branches:
only:
- main
requires:
- docker-image
- deploy-neon-stress:
# Context gives an ability to login
context: Docker Hub
# deploy only for commits to main
filters:
branches:
only:
- main
requires:
- docker-image
- deploy-neon-stress-proxy:
# deploy only for commits to main
filters:
branches:
only:
- main
requires:
- docker-image
- docker-image-release:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- release
requires:
- pg_regress-tests-release
- other-tests-release
- docker-image-compute-release:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- release
requires:
- pg_regress-tests-release
- other-tests-release
- deploy-release:
# Context gives an ability to login
context: Docker Hub
# deploy only for commits to main
filters:
branches:
only:
- release
requires:
- docker-image-release
- deploy-release-proxy:
# deploy only for commits to main
filters:
branches:
only:
- release
requires:
- docker-image-release

View File

@@ -31,13 +31,6 @@ inputs:
runs:
using: "composite"
steps:
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Get Neon artifact for restoration
uses: actions/download-artifact@v3
with:
@@ -48,14 +41,15 @@ runs:
shell: bash -ex {0}
run: |
mkdir -p /tmp/neon/
tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
rm -rf ./neon-artifact/
# Restore the parts of the 'build' directory that were included in the
# tarball. This includes the regression test modules in
# src/test/regress/*.so.
mkdir -p build/
cp -a /tmp/neon/pg_build/* build/
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Cache poetry deps
id: cache_poetry
@@ -91,7 +85,7 @@ runs:
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
if [[ "$GITHUB_REF" == "main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
@@ -121,7 +115,7 @@ runs:
-rA $TEST_SELECTION $EXTRA_PARAMS
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
if [[ "$GITHUB_REF" == "main" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO=local
scripts/generate_and_push_perf_report.sh

View File

@@ -11,7 +11,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '36 4 * * *' # run once a day, timezone is utc
- cron: '36 7 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
@@ -26,11 +26,11 @@ jobs:
runs-on: [self-hosted, zenith-benchmarker]
env:
POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"
POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
steps:
- name: Checkout zenith repo
uses: actions/checkout@v3
uses: actions/checkout@v2
# actions/setup-python@v2 is not working correctly on self-hosted runners
# see https://github.com/actions/setup-python/issues/162
@@ -88,7 +88,7 @@ jobs:
# Plus time needed to initialize the test databases.
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
PLATFORM: "neon-staging"
PLATFORM: "zenith-staging"
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
run: |
@@ -96,7 +96,7 @@ jobs:
# since it might generate duplicates when calling ingest_perf_test_result.py
rm -rf perf-report-staging
mkdir -p perf-report-staging
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
- name: Submit result
env:
@@ -104,12 +104,3 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
run: |
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -1,10 +1,9 @@
name: Test and Deploy
name: Test
on:
push:
branches:
- main
- release
pull_request:
defaults:
@@ -12,17 +11,61 @@ defaults:
shell: bash -ex {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
jobs:
build-postgres:
runs-on: [ self-hosted, Linux, k8s-runner ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
env:
BUILD_TYPE: ${{ matrix.build_type }}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v3
with:
path: tmp_install/
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: mold -run make postgres -j$(nproc)
# actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
- name: Prepare postgres artifact
run: tar -C tmp_install/ -czf ./pg.tgz .
- name: Upload postgres artifact
uses: actions/upload-artifact@v3
with:
retention-days: 7
if-no-files-found: error
name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
path: ./pg.tgz
build-neon:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-postgres ]
strategy:
fail-fast: false
matrix:
@@ -39,80 +82,80 @@ jobs:
submodules: true
fetch-depth: 1
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
# It also includes --features, if any
#
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
- name: Set env variables
- name: Get postgres artifact for restoration
uses: actions/download-artifact@v3
with:
name: postgres-${{ runner.os }}-${{ matrix.build_type }}-artifact
path: ./postgres-artifact/
- name: Extract postgres artifact
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES=""
CARGO_FLAGS=""
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features profiling"
CARGO_FLAGS="--release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
mkdir ./tmp_install/
tar -xf ./postgres-artifact/pg.tgz -C ./tmp_install/
rm -rf ./postgres-artifact/
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
# directory, and it's faster to let 'cargo' to rebuild it from the
# compressed crates.
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry/
!~/.cargo/registry/src
~/.cargo/git/
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v3
with:
path: |
tmp_install/
build/src/test/regress/*.so
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: mold -run make postgres -j$(nproc)
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS="--release --features profiling"
fi
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
- name: Run cargo test
run: |
${cov_prefix} cargo test $CARGO_FLAGS
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS=--release
fi
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
- name: Install rust binaries
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
binaries=$(
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
test_exe_paths=$(
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
mkdir -p /tmp/neon/bin/
mkdir -p /tmp/neon/test_bin/
mkdir -p /tmp/neon/etc/
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
# Install target binaries
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
@@ -121,39 +164,22 @@ jobs:
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
# We don't need debug symbols for code coverage, so strip them out to make
# the artifact smaller.
strip "$SRC" -o "$DST"
cp "$SRC" "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
- name: Install postgres binaries
run: |
cp -a tmp_install /tmp/neon/pg_install
# Include modules needed by the Postgres regression tests
mkdir -p /tmp/neon/pg_build/src/test/regress
cp -a build/src/test/regress/*.so /tmp/neon/pg_build/src/test/regress
run: cp -a tmp_install /tmp/neon/pg_install
- name: Prepare neon artifact
run: ZSTD_NBTHREADS=0 tar -C /tmp/neon/ -cf ./neon.tar.zst --zstd .
run: tar -C /tmp/neon/ -czf ./neon.tgz .
- name: Upload neon binaries
uses: actions/upload-artifact@v3
@@ -161,7 +187,7 @@ jobs:
retention-days: 7
if-no-files-found: error
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: ./neon.tar.zst
path: ./neon.tgz
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
@@ -245,9 +271,6 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: true
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
@@ -272,10 +295,9 @@ jobs:
with:
path: |
~/.cargo/registry/
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact for restoration
uses: actions/download-artifact@v3
@@ -286,7 +308,7 @@ jobs:
- name: Extract Neon artifact
run: |
mkdir -p /tmp/neon/
tar -xf ./neon-artifact/neon.tar.zst -C /tmp/neon/
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
rm -rf ./neon-artifact/
- name: Restore coverage data
@@ -365,253 +387,3 @@ jobs:
\"remote_repo\": \"${{ github.repository }}\"
}
}"
docker-image:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ pg_regress-tests, other-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
with:
driver: docker
- name: Get build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: build-tag
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: legacy-build-tag
- name: Build neon Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
pull: true
push: true
tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
docker-image-compute:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ pg_regress-tests, other-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
with:
driver: docker
- name: Get build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: build-tag
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: legacy-build-tag
- name: Build compute-tools Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
push: false
file: Dockerfile.compute-tools
tags: neondatabase/compute-tools:local
- name: Push compute-tools Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
push: true
file: Dockerfile.compute-tools
tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
- name: Build compute-node Docker image
uses: docker/build-push-action@v2
with:
context: ./vendor/postgres/
build-args:
COMPUTE_TOOLS_TAG=local
push: true
tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
calculate-deploy-targets:
runs-on: [ self-hosted, Linux, k8s-runner ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
matrix-include: ${{ steps.set-matrix.outputs.include }}
steps:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
echo "::set-output name=include::[$STAGING]"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
echo "::set-output name=include::[$PRODUCTION]"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
deploy:
runs-on: [ self-hosted, Linux, k8s-runner ]
# We need both storage **and** compute images for deploy, because control plane
# picks the compute version based on the storage version. If it notices a fresh
# storage it may bump the compute version. And if compute image failed to build
# it may break things badly.
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Setup ansible
run: |
pip install --progress-bar off --user ansible boto3
- name: Redeploy
run: |
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
eval $(ssh-agent)
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: [ self-hosted, Linux, k8s-runner ]
# Compute image isn't strictly required for proxy deploy, but let's still wait for it
# to run all deploy jobs consistently.
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
env:
KUBECONFIG: .kubeconfig
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Store kubeconfig file
run: |
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- name: Setup helm v3
run: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

View File

@@ -11,9 +11,8 @@ defaults:
shell: bash -ex {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
@@ -98,10 +97,9 @@ jobs:
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
- name: Run cargo clippy
run: ./run_clippy.sh

View File

@@ -1,72 +0,0 @@
name: Test Postgres client libraries
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
workflow_dispatch:
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
jobs:
test-postgres-client-libs:
runs-on: [ ubuntu-latest ]
steps:
- name: Checkout
uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -ex {0}
run: ./scripts/pysync
- name: Run pytest
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
TEST_OUTPUT: /tmp/test_output
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -ex {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "remote_cluster" \
-rA "test_runner/pg_clients"
- name: Post to a Slack channel
if: failure()
id: slack
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

Cargo.lock (generated)

File diff suppressed because it is too large.

View File

@@ -1,8 +1,3 @@
ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# Where to install Postgres, default is ./tmp_install, maybe useful for package managers
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
@@ -46,73 +41,69 @@ CARGO_CMD_PREFIX += $(if $(filter n,$(MAKEFLAGS)),,+)
CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
#
# Top level Makefile to build Neon and PostgreSQL
# Top level Makefile to build Zenith and PostgreSQL
#
.PHONY: all
all: neon postgres
all: zenith postgres
### Neon Rust bits
### Zenith Rust bits
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: neon
neon: postgres-headers
+@echo "Compiling Neon"
.PHONY: zenith
zenith: postgres-headers
+@echo "Compiling Zenith"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
### PostgreSQL parts
#
# Postgres is built in the 'build' directory, and installed into
# $(POSTGRES_INSTALL_DIR), which defaults to 'tmp_install'
#
build/config.status:
tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p build
(cd build && \
$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
$(SECCOMP) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)
--prefix=$(abspath tmp_install) > configure.log)
# nicer alias for running 'configure'
.PHONY: postgres-configure
postgres-configure: build/config.status
postgres-configure: tmp_install/build/config.status
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
# Install the PostgreSQL header files into tmp_install/include
.PHONY: postgres-headers
postgres-headers: postgres-configure
+@echo "Installing PostgreSQL headers"
$(MAKE) -C build/src/include MAKELEVEL=0 install
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL and contrib/neon
.PHONY: postgres
postgres: postgres-configure \
postgres-headers # to prevent `make install` conflicts with neon's `postgres-headers`
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
+@echo "Compiling PostgreSQL"
$(MAKE) -C build MAKELEVEL=0 install
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/neon"
$(MAKE) -C build/contrib/neon install
$(MAKE) -C tmp_install/build/contrib/neon install
+@echo "Compiling contrib/neon_test_utils"
$(MAKE) -C build/contrib/neon_test_utils install
$(MAKE) -C tmp_install/build/contrib/neon_test_utils install
+@echo "Compiling pg_buffercache"
$(MAKE) -C build/contrib/pg_buffercache install
$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+@echo "Compiling pageinspect"
$(MAKE) -C build/contrib/pageinspect install
$(MAKE) -C tmp_install/build/contrib/pageinspect install
.PHONY: postgres-clean
postgres-clean:
$(MAKE) -C build MAKELEVEL=0 clean
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd build && $(MAKE) clean
cd tmp_install/build && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean
# This removes everything
.PHONY: distclean
distclean:
rm -rf build $(POSTGRES_INSTALL_DIR)
rm -rf tmp_install
$(CARGO_CMD_PREFIX) cargo clean
.PHONY: fmt
@@ -121,4 +112,4 @@ fmt:
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
ln -s -f ../../pre-commit.py .git/hooks/pre-commit

View File

@@ -18,5 +18,5 @@ serde_json = "1"
tar = "0.4"
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
url = "2.2.2"
urlencoding = "2.1.0"
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -33,7 +33,7 @@ use std::process::exit;
use std::sync::{Arc, RwLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use anyhow::Result;
use chrono::Utc;
use clap::Arg;
use log::{error, info};
@@ -45,7 +45,6 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::pg_helpers::*;
use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
// TODO: re-use `utils::logging` later
@@ -132,7 +131,7 @@ fn main() -> Result<()> {
let compute_state = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
connstr: connstr.to_string(),
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
spec,

View File

@@ -1,3 +1,5 @@
use std::sync::Arc;
use anyhow::{anyhow, Result};
use log::error;
use postgres::Client;
@@ -21,8 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
Ok(())
}
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
let connstr = &compute.connstr;
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
if client.is_closed() {
return Err(anyhow!("connection to postgres closed"));
}

View File

@@ -35,8 +35,7 @@ use crate::spec::*;
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
pub start_time: DateTime<Utc>,
// Url type maintains proper escaping
pub connstr: url::Url,
pub connstr: String,
pub pgdata: String,
pub pgbin: String,
pub spec: ComputeSpec,
@@ -269,25 +268,21 @@ impl ComputeNode {
// In this case we need to connect with the old `zenith_admin` name
// and create a new user. We cannot simply rename the connected user,
// but we can create a new one and grant it all privileges.
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
let mut client = match Client::connect(&self.connstr, NoTls) {
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = self.connstr.clone();
let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with a connstring with the expected name
Client::connect(self.connstr.as_str(), NoTls)?
Client::connect(&self.connstr, NoTls)?
}
Ok(client) => client,
};
@@ -295,7 +290,7 @@ impl ComputeNode {
handle_roles(&self.spec, &mut client)?;
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(self, &mut client)?;
handle_grants(&self.spec, &mut client)?;
create_writablity_check_data(&mut client)?;
// 'Close' connection

View File

@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
// XXX: the only expected panic is at `RwLock` unwrap().
fn watch_compute_activity(compute: &ComputeNode) {
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.as_str();
let connstr = compute.connstr.clone();
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = Client::connect(connstr, NoTls);
let mut client = Client::connect(&connstr, NoTls);
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
info!("watching Postgres activity at {}", connstr);
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
info!("connection to postgres closed, trying to reconnect");
// Connection is closed, reconnect and try again.
client = Client::connect(connstr, NoTls);
client = Client::connect(&connstr, NoTls);
continue;
}
@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
debug!("cannot connect to postgres: {}, retrying", e);
// Establish a new connection and try again.
client = Client::connect(connstr, NoTls);
client = Client::connect(&connstr, NoTls);
}
}
}

View File

@@ -1,4 +1,3 @@
use std::fmt::Write;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::net::{SocketAddr, TcpStream};
@@ -139,11 +138,9 @@ impl Role {
// Now we also support SCRAM-SHA-256 and to preserve compatibility
// we treat all encrypted_password as md5 unless it starts with SCRAM-SHA-256.
if pass.starts_with("SCRAM-SHA-256") {
write!(params, " PASSWORD '{pass}'")
.expect("String is documented to not to error during write operations");
params.push_str(&format!(" PASSWORD '{}'", pass));
} else {
write!(params, " PASSWORD 'md5{pass}'")
.expect("String is documented to not to error during write operations");
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
}
} else {
params.push_str(" PASSWORD NULL");
@@ -161,8 +158,7 @@ impl Database {
/// it may require a proper quoting too.
pub fn to_pg_options(&self) -> String {
let mut params: String = self.options.as_pg_options();
write!(params, " OWNER {}", &self.owner.quote())
.expect("String is documented to not to error during write operations");
params.push_str(&format!(" OWNER {}", &self.owner.quote()));
params
}
@@ -248,20 +244,18 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
bail!("Postgres exited unexpectedly with code {}", code);
}
// Check that we can open pid file first.
if let Ok(file) = File::open(&pid_path) {
let file = BufReader::new(file);
let last_line = file.lines().last();
if pid_path.exists() {
let file = BufReader::new(File::open(&pid_path)?);
let status = file
.lines()
.last()
.unwrap()
.unwrap_or_else(|_| "unknown".to_string());
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
// Pid file could be there and we could read it, but it could be empty, for example.
if let Some(Ok(line)) = last_line {
let status = line.trim();
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
// Now Postgres is ready to accept connections
if status == "ready" && can_connect {
break;
}
// Now Postgres is ready to accept connections
if status.trim() == "ready" && can_connect {
break;
}
}

View File

@@ -4,6 +4,7 @@ use anyhow::Result;
use log::{info, log_enabled, warn, Level};
use postgres::{Client, NoTls};
use serde::Deserialize;
use urlencoding::encode;
use crate::compute::ComputeNode;
use crate::config;
@@ -230,11 +231,9 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
if db.owner != *role_name {
let mut connstr = node.connstr.clone();
// database name is always the last and the only component of the path
connstr.set_path(&db.name);
let mut client = Client::connect(connstr.as_str(), NoTls)?;
let db_name_encoded = format!("/{}", encode(&db.name));
let db_connstr = node.connstr.replacen("/postgres", &db_name_encoded, 1);
let mut client = Client::connect(&db_connstr, NoTls)?;
// This will reassign all dependent objects to the db owner
let reassign_query = format!(
@@ -349,11 +348,9 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
Ok(())
}
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users to create trusted extensions and re-create the `public` schema, for example.
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
// Grant CREATE ON DATABASE to the database owner
// to allow clients create trusted extensions.
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
@@ -382,47 +379,5 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
client.execute(query.as_str(), &[])?;
}
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.
let mut db_connstr = node.connstr.clone();
for db in &node.spec.cluster.databases {
// database name is always the last and the only component of the path
db_connstr.set_path(&db.name);
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
// This will only change ownership on the schema itself, not the objects
// inside it. Without it owner of the `public` schema will be `cloud_admin`
// and database owner cannot do anything with it. SQL procedure ensures
// that it won't error out if schema `public` doesn't exist.
let alter_query = format!(
"DO $$\n\
DECLARE\n\
schema_owner TEXT;\n\
BEGIN\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
)\n\
THEN\n\
SELECT nspowner::regrole::text\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
INTO schema_owner;\n\
\n\
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\
THEN\n\
ALTER SCHEMA public OWNER TO {};\n\
END IF;\n\
END IF;\n\
END\n\
$$;",
db.owner.quote()
);
db_client.simple_query(&alter_query)?;
}
Ok(())
}

View File

@@ -403,6 +403,16 @@ impl LocalEnv {
self.pg_distrib_dir.display()
);
}
for binary in ["pageserver", "safekeeper"] {
if !self.zenith_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{}' in zenith distrib dir '{}'",
binary,
self.zenith_distrib_dir.display()
);
}
}
for binary in ["pageserver", "safekeeper"] {
if !self.zenith_distrib_dir.join(binary).exists() {
bail!(
@@ -411,6 +421,12 @@ impl LocalEnv {
);
}
}
if !self.pg_distrib_dir.join("bin/postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_distrib_dir.display()
);
}
fs::create_dir(&base_path)?;

docs/.gitignore (vendored)
View File

@@ -1 +0,0 @@
book

docs/README.md (new file)
View File

@@ -0,0 +1,14 @@
# Zenith documentation
## Table of contents
- [authentication.md](authentication.md) — pageserver JWT authentication.
- [docker.md](docker.md) — Docker images and building pipeline.
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [sourcetree.md](sourcetree.md) — Overview of the source tree layout.
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core

View File

@@ -1,84 +0,0 @@
# Summary
[Introduction]()
- [Separation of Compute and Storage](./separation-compute-storage.md)
# Architecture
- [Compute]()
- [WAL proposer]()
- [WAL Backpressure]()
- [Postgres changes](./core_changes.md)
- [Pageserver](./pageserver.md)
- [Services](./pageserver-services.md)
- [Thread management](./pageserver-thread-mgmt.md)
- [WAL Redo](./pageserver-walredo.md)
- [Page cache](./pageserver-pagecache.md)
- [Storage](./pageserver-storage.md)
- [Datadir mapping]()
- [Layer files]()
- [Branching]()
- [Garbage collection]()
- [Cloud Storage]()
- [Processing a GetPage request](./pageserver-processing-getpage.md)
- [Processing WAL](./pageserver-processing-wal.md)
- [Management API]()
- [Tenant Rebalancing]()
- [WAL Service](walservice.md)
- [Consensus protocol](safekeeper-protocol.md)
- [Management API]()
- [Rebalancing]()
- [Control Plane]()
- [Proxy]()
- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging]()
- [Testing]()
- [Unit testing]()
- [Integration testing]()
- [Benchmarks]()
- [Glossary](./glossary.md)
# Uncategorized
- [authentication.md](./authentication.md)
- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [settings.md](./settings.md)
#FIXME: move these under sourcetree.md
#- [pageserver/README.md](/pageserver/README.md)
#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
#- [test_runner/README.md](/test_runner/README.md)
#- [safekeeper/README.md](/safekeeper/README.md)
# RFCs
- [RFCs](./rfcs/README.md)
- [002-storage](rfcs/002-storage.md)
- [003-laptop-cli](rfcs/003-laptop-cli.md)
- [004-durability](rfcs/004-durability.md)
- [005-zenith_local](rfcs/005-zenith_local.md)
- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
- [008-push-pull](rfcs/008-push-pull.md)
- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
- [010-storage_details](rfcs/010-storage_details.md)
- [011-retention-policy](rfcs/011-retention-policy.md)
- [012-background-tasks](rfcs/012-background-tasks.md)
- [013-term-history](rfcs/013-term-history.md)
- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
- [014-storage-lsm](rfcs/014-storage-lsm.md)
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -1,5 +0,0 @@
[book]
language = "en"
multilingual = false
src = "."
title = "Neon architecture"

View File

@@ -1,519 +1,202 @@
# Postgres core changes
1. Add t_cid to XLOG record
- Why?
The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
This lists all the changes that have been made to the PostgreSQL
source tree, as a somewhat logical set of patches. The long-term goal
is to eliminate all these changes, by submitting patches to upstream
and refactoring code into extensions, so that you can run unmodified
PostgreSQL against Neon storage.
To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. The same goes for deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore.
In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the
page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in
the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for
the WAL redo process.
- Alternatives?
I don't know
In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the
smgr interface. Once all the core changes have been submitted to upstream or eliminated some other
way, the extension could live outside the postgres repository and build against vanilla PostgreSQL.
2. Add PD_WAL_LOGGED.
- Why?
Postgres sometimes writes data to a page before it is WAL-logged. If such a page is swapped out, we will lose this change. The problem is currently solved by setting the PD_WAL_LOGGED bit in the page header. When a page without this bit set is written to the SMGR, it is forced to be written to the WAL as an FPI using the log_newpage_copy() function.
Below is a list of all the PostgreSQL source code changes, categorized into changes needed for
compute, and changes needed for the WAL redo process:
There was a wrong assumption that this can happen only during construction of some exotic indexes (like GiST). That is not true: the same situation can happen with COPY, VACUUM, and when record hint bits are set.
# Changes for Compute node
- Discussion:
https://discord.com/channels/869525774699462656/882681420986851359
## Add t_cid to heap WAL records
- Alternatives:
Do not store this flag in the page header, but associate this bit with the shared buffer. Logically it is more correct, but in practice we gain no advantages: neither in space nor in CPU overhead.
```
src/backend/access/heap/heapam.c | 26 +-
src/include/access/heapam_xlog.h | 6 +-
```
We have added a new t_cid field to heap WAL records. This changes the WAL record format, making Neon WAL format incompatible with vanilla PostgreSQL!
3. XLogReadBufferForRedo does not always load and pin the requested buffer, so we need to add extra checks that the buffer is really pinned. Also, do not use BufferGetBlockNumber on a buffer returned by XLogReadBufferForRedo.
- Why?
XLogReadBufferForRedo does not pin pages that were not requested by WAL redo. This is specific to the wal-redo Postgres.
### Problem we're trying to solve
- Alternatives?
No
The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running.
### How to get rid of the patch
4. Eliminate reporting of some warnings related to hint bits, for example
"page is not marked all-visible but visibility map bit is set in relation".
- Why?
Hint bits may not be WAL-logged.
Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information.
- Alternative?
Always WAL-log any page changes.
### Alternatives
Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows touched by a transaction that's still running. However, that seems very complicated.
5. Maintain last written LSN.
- Why?
When the compute node requests a page from the page server, we need to specify an LSN. Ideally it should be the LSN
of the WAL record that last updated this page, but we do not know it, because we do not have the page.
We can use the current WAL flush position, but then there is a high probability that the page server
will be blocked until this piece of WAL is delivered.
As a better approximation we can keep the max LSN of any written page. It would be better to take into account only the LSNs of evicted pages,
but the SMGR API doesn't provide that knowledge.
## ginfast.c
- Alternatives?
Maintain map of LSNs of evicted pages.
```
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index e0d9940946..2d964c02e9 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
memset(&sublist, 0, sizeof(GinMetaPageData));
makeSublist(index, collector->tuples, collector->ntuples, &sublist);
+ if (metadata->head != InvalidBlockNumber)
+ {
+ /*
+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call
+ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from
+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write()
+ * will try to WAL-log an image of the page.
+ */
+ buffer = ReadBuffer(index, metadata->tail);
+ }
+
if (needWal)
XLogBeginInsert();
@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
data.prevTail = metadata->tail;
data.newRightlink = sublist.head;
- buffer = ReadBuffer(index, metadata->tail);
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
```
The problem is explained in the comment above
6. Launching Postgres without WAL.
- Why?
According to the Zenith architecture, the compute node is stateless. So when we launch a
compute node, we need to provide some dummy PG_DATADIR. Relation pages
can be requested on demand from the page server, but Postgres still needs some non-relational data:
control and configuration files, SLRUs, ...
This is currently implemented using a basebackup (not to be confused with pg_basebackup) which is created
by the pageserver. The tarball includes config/control files, SLRUs and the required directories.
As the pageserver does not have the original (non-scattered) WAL segments, it includes in
this tarball a dummy WAL segment which contains only a SHUTDOWN_CHECKPOINT record at the beginning of the segment,
whose redo field points to the end of WAL. This allows loading the checkpoint record in a more or less
standard way with minimal changes to Postgres, but then some special handling is needed,
including restoring the previous record position from the zenith.signal file.
We also have to correctly initialize the header of the last WAL page (pointed to by checkpoint.redo)
to pass the checks performed by XLogReader.
### How to get rid of the patch
- Alternatives?
We could avoid including the fake WAL segment in the tarball at all, and modify xlog.c to load the checkpoint record
in a special way. But that may only increase the number of changes in xlog.c.
Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical
section or something.
7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
- Why?
We need a way for the wal-redo Postgres to ignore pages that were not requested by the pageserver,
so that it reconstructs only the requested page and returns BLK_DONE for all others,
meaning that recovery for them is not needed.
Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images?
- Alternatives?
No
8. Enforce WAL logging of sequence updates.
- Why?
For performance reasons Postgres doesn't want to log each fetch of a value from a sequence,
so it pre-logs a few fetches in advance. In the event of a crash we can lose
(skip over) as many values as we pre-logged.
But this doesn't work with Zenith, because the page holding the sequence value can be evicted from the buffer cache,
and we will get a gap in sequence values even without a crash.
## Mark index builds that use buffer manager without logging explicitly
- Alternatives:
Do not try to preserve sequential order but avoid performance penalty.
```
src/backend/access/gin/gininsert.c | 7 +
src/backend/access/gist/gistbuild.c | 15 +-
src/backend/access/spgist/spginsert.c | 8 +-
also some changes in src/backend/storage/smgr/smgr.c
```
9. Treat unlogged tables as normal (permanent) tables.
- Why?
Unlogged tables are not transient, so they have to survive a node restart (unlike temporary tables).
But since the compute node is stateless, we need to persist their data to the storage node,
and that can only be done through the WAL.
When a GIN index is built, for example, it is built by inserting the entries into the index more or
less normally, but without WAL-logging anything. After the index has been built, we iterate through
all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged
and is evicted from the buffer cache, it is lost. We have a check to catch that in the Neon
extension. To fix that, we've added a few functions to track explicitly when we're performing such
an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and
`smgr_end_unlogged_build`.
### How to get rid of the patch
I think it would make sense to be more explicit about that in PostgreSQL too. So extract these
changes to a patch and post to pgsql-hackers.
- Alternatives?
* Store unlogged tables locally (violates requirement of stateless compute nodes).
* Prohibit unlogged tables entirely.
## Track last-written page LSN
10. Support start Postgres in wal-redo mode
- Why?
To be able to apply WAL record and reconstruct pages at page server.
```
src/backend/commands/dbcommands.c | 17 +-
- Alternatives?
* Rewrite redo handlers in Rust
* Do not reconstruct pages at page server at all and do it at compute node.
Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too
```
Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same
LSN in the GetPage@LSN request when reading the page back from the page server. The value is
conservative: it would be correct to always use the last-inserted LSN, but it would be slow because
then the page server would need to wait for the recent WAL to be streamed and processed, before
responding to any GetPage@LSN request.
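As an illustration only (not the actual Neon code, whose types and placement differ), the mechanism boils down to keeping a monotonically advancing LSN that evictions bump and GetPage@LSN requests read:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

/// Illustrative last-written page LSN tracker; the names here are made up for this sketch.
struct LastWrittenLsn(AtomicU64);

impl LastWrittenLsn {
    fn new() -> Self {
        LastWrittenLsn(AtomicU64::new(0))
    }

    /// Called when a page is written out (evicted): never moves backwards.
    fn advance(&self, lsn: u64) {
        self.0.fetch_max(lsn, Ordering::Relaxed);
    }

    /// LSN to put in the GetPage@LSN request.
    fn get(&self) -> u64 {
        self.0.load(Ordering::Relaxed)
    }
}

fn main() {
    let last_written = LastWrittenLsn::new();
    last_written.advance(0x0100_0000); // eviction at this WAL position
    last_written.advance(0x00ff_0000); // an older eviction does not move it back
    assert_eq!(last_written.get(), 0x0100_0000);
}
```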
11. WAL proposer
- Why?
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
It is currently implemented as patch to standard WAL sender.
The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes,
but there are a few exceptions where we've had to add explicit calls to the Neon-specific
SetLastWrittenPageLSN() function.
- Alternatives?
Can be moved to extension if some extra callbacks will be added to wal sender code.
There's an open PR to track the LSN in a more fine-grained fashion:
https://github.com/neondatabase/postgres/pull/177
PostgreSQL v15 introduces a new method for CREATE DATABASE that WAL-logs the database instead of
relying on copying files and a checkpoint. With that method, we probably won't need any special handling.
The old method is still available, though.
12. Secure Computing BPF API wrapper.
- Why?
Pageserver delegates complex WAL decoding duties to Postgres,
which means that the latter might fall victim to carefully designed
malicious WAL records and start doing harmful things to the system.
To prevent this, it has been decided to limit possible interactions
with the outside world using the Secure Computing BPF mode.
- Alternatives:
* Rewrite redo handlers in Rust.
* Add more checks to guarantee correctness of WAL records.
* Move seccomp.c to extension
* Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
13. Callbacks for replica feedbacks
- Why?
Allowing walproposer to interact with the walsender code.
- Alternatives
Copy walsender code to walproposer.
14. Support multiple SMGR implementations.
- Why?
Postgres provides abstract API for storage manager but it has only one implementation
and provides no way to replace it with custom storage manager.
- Alternatives?
None.
15. Calculate database size as sum of all database relations.
- Why?
Postgres calculates the database size by traversing the data directory,
but since the Zenith compute node is stateless we cannot do that.
- Alternatives?
Send this request directly to the pageserver and calculate the real (physical) size
of the Zenith representation of the database/timeline, rather than summing the logical sizes of all relations.
### How to get rid of the patch
Wait until v15?
-----------------------------------------------
Not currently committed but proposed:
1. Disable ring buffer buffer manager strategies
- Why?
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
Even if there are free space in buffer cache, pages may be evicted.
Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
cost of requesting page from page server is much higher.
## Cache relation sizes
- Alternatives?
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
for example copy evicted page from ring buffer to some other buffer if there is free space
in buffer cache.
The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going
to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the
relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for
Neon)
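For illustration, such a cache is conceptually just a map from a relation (and fork) to a block count, filled on the first page-server round trip and updated on extend/truncate. The key type and API below are invented for this sketch, not the Neon extension's real interface:

```rust
use std::collections::HashMap;

/// Hypothetical cache key: (relation id, fork number). The real keys and
/// invalidation rules live in the Neon extension.
type RelKey = (u32, u8);

#[derive(Default)]
struct RelSizeCache {
    nblocks: HashMap<RelKey, u32>,
}

impl RelSizeCache {
    /// Return the cached size, or compute it with `fetch` (standing in for a
    /// page server round trip) and remember the result.
    fn get_or_fetch(&mut self, key: RelKey, fetch: impl FnOnce() -> u32) -> u32 {
        *self.nblocks.entry(key).or_insert_with(fetch)
    }

    /// Must be called whenever the relation is extended or truncated,
    /// otherwise the cache would return stale sizes.
    fn update(&mut self, key: RelKey, new_nblocks: u32) {
        self.nblocks.insert(key, new_nblocks);
    }
}

fn main() {
    let mut cache = RelSizeCache::default();
    let key = (16384, 0);
    let n = cache.get_or_fetch(key, || 128); // first call "asks the page server"
    assert_eq!(n, 128);
    cache.update(key, 129); // relation extended by one block
    assert_eq!(cache.get_or_fetch(key, || unreachable!()), 129);
}
```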
2. Disable marking page as dirty when hint bits are set.
- Why?
Postgres has to modify page twice: first time when some tuple is updated and second time when
hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL.
- Alternatives?
Add special WAL record for setting page hints.
## Misc change in vacuumlazy.c
3. Prefetching
- Why?
As far as pages in Zenith are loaded on demand, to reduce node startup time
and also speedup some massive queries we need some mechanism for bulk loading to
reduce page request round-trip overhead.
```
index 8aab6e324e..c684c4fbee 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
else if (all_visible_according_to_vm && !PageIsAllVisible(page)
&& VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
{
- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
+ /* ZENITH-XXX: all visible hint is not wal-logged
+ * FIXME: Replay visibilitymap changes in pageserver
+ */
+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
vacrel->relname, blkno);
visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
VISIBILITYMAP_VALID_BITS);
```
Currently Postgres is supporting prefetching only for bitmap scan.
In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
Is this still needed? If that WARNING happens, it looks like potential corruption that we should
fix!
## Use buffer manager when extending VM or FSM
```
src/backend/storage/freespace/freespace.c | 14 +-
src/backend/access/heap/visibilitymap.c | 15 +-
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e198df65d8..addfe93eac 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
/* Now extend the file */
while (vm_nblocks_now < vm_nblocks)
{
- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
+ /*
+ * ZENITH: Initialize VM pages through buffer cache to prevent loading
+ * them from pageserver.
+ */
+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
+ RBM_ZERO_AND_LOCK, NULL);
+ Page page = BufferGetPage(buffer);
+
+ PageInit((Page) page, BLCKSZ, 0);
+ PageSetChecksumInplace(page, vm_nblocks_now);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
- pg.data, false);
vm_nblocks_now++;
}
```
### Problem we're trying to solve
???
### How to get rid of the patch
Maybe this would be a reasonable change in PostgreSQL too?
## Allow startup without reading checkpoint record
In Neon, the compute node is stateless. So when we launch a compute node, we need to provide
some dummy PG_DATADIR. Relation pages can be requested on demand from the page server, but Postgres
still needs some non-relational data: control and configuration files, SLRUs, ... This is currently
implemented using a basebackup (not to be confused with pg_basebackup) which is created by the pageserver.
The tarball includes config/control files, SLRUs and the required directories.
As pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL
segment to bootstrap the WAL writing, but it doesn't contain the checkpoint record. There are some
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
from WAL.
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
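The on-disk format of `zenith.signal` is not reproduced here; the only piece worth illustrating is the LSN arithmetic itself. A hypothetical parser for a PostgreSQL-style `hi/lo` LSN string might look like this:

```rust
/// Parse a PostgreSQL-style LSN string ("hi/lo" in hex) into a u64.
/// This only illustrates the LSN arithmetic; it is not the actual
/// zenith.signal reader.
fn parse_lsn(s: &str) -> Option<u64> {
    let (hi, lo) = s.split_once('/')?;
    let hi = u64::from_str_radix(hi, 16).ok()?;
    let lo = u64::from_str_radix(lo, 16).ok()?;
    Some((hi << 32) | lo)
}

fn main() {
    assert_eq!(parse_lsn("0/16B3748"), Some(0x16B3748));
    assert_eq!(parse_lsn("1/0"), Some(0x1_0000_0000));
}
```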
### How to get rid of the patch
???
### Alternatives
Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm
afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL.
## Disable sequence caching
```
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 0415df9ccb..9f9db3c8bc 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -53,7 +53,9 @@
* so we pre-log a few fetches in advance. In the event of
* crash we can lose (skip over) as many values as we pre-logged.
*/
-#define SEQ_LOG_VALS 32
+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
+/* #define SEQ_LOG_VALS 32 */
+#define SEQ_LOG_VALS 0
```
For performance reasons Postgres doesn't want to log each fetch of a value from a sequence, so
it pre-logs a few fetches in advance. In the event of a crash we can lose (skip over) as many values
as we pre-logged. But with Neon, because the page holding the sequence value can be evicted from the buffer
cache, we can get a gap in sequence values even without a crash.
### How to get rid of the patch
Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence
relations in the Neon extension, to WAL log the sequence page when it's about to be evicted. It
would be weird if the sequence moved backwards though, think of PITR.
Or add a GUC to PostgreSQL for the number of sequence values to pre-log, and force it to 1 in Neon.
## Walproposer
```
src/Makefile | 1 +
src/backend/replication/libpqwalproposer/Makefile | 37 +
src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++
src/backend/postmaster/bgworker.c | 4 +
src/backend/postmaster/postmaster.c | 6 +
src/backend/replication/Makefile | 4 +-
src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/backend/replication/walproposer_utils.c | 402 +++++++++++
src/backend/replication/walreceiver.c | 7 +
src/backend/replication/walsender.c | 320 ++++++---
src/backend/storage/ipc/ipci.c | 6 +
src/include/replication/walproposer.h | 565 ++++++++++++++++
```
The WAL proposer communicates with the safekeepers and ensures WAL durability by quorum writes. It is
currently implemented as a patch to the standard WAL sender.
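The durability rule itself is simple to state: a WAL position is safe once a majority of safekeepers have flushed it. A toy illustration of that rule (not walproposer's actual code, which also tracks terms and epochs):

```rust
/// Toy illustration of quorum durability: a WAL position is durable once a
/// majority of safekeepers have flushed it, i.e. the (n/2 + 1)-th largest
/// acknowledged flush position.
fn quorum_flush_lsn(mut acked: Vec<u64>) -> u64 {
    acked.sort_unstable_by(|a, b| b.cmp(a)); // descending
    acked[acked.len() / 2] // position held by a majority
}

fn main() {
    // Three safekeepers have flushed up to these LSNs.
    let acked = vec![0x30, 0x10, 0x20];
    // Two of three (a majority) have everything up to 0x20.
    assert_eq!(quorum_flush_lsn(acked), 0x20);
}
```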
### How to get rid of the patch
Refactor into an extension. Submit hooks or APIs into upstream if necessary.
@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96
## Ignore unexpected data beyond EOF in bufmgr.c
```
@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (!PageIsNew((Page) bufBlock))
- ereport(ERROR,
+ {
+ // XXX-ZENITH
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ ereport(DEBUG1,
(errmsg("unexpected data beyond EOF in block %u of relation %s",
blockNum, relpath(smgr->smgr_rnode, forkNum)),
errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
-
+ }
/*
* We *must* do smgrextend before succeeding, else the page will not
* be reserved by the kernel, and the next P_NEW call will decide to
```
PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros
first, then the page is filled, and finally the new page is WAL-logged. But if multiple backends extend
a relation at the same time, the pages can be WAL-logged in a different order.
I'm not sure exactly what scenario required this change in Neon, though.
### How to get rid of the patch
Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation,
and finally WAL-log that the extension succeeded.
## Make smgr interface available to extensions
```
src/backend/storage/smgr/smgr.c | 203 +++---
src/include/storage/smgr.h | 72 +-
```
### How to get rid of the patch
Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.
## Added relpersistence argument to smgropen()
```
src/backend/access/heap/heapam_handler.c | 2 +-
src/backend/catalog/storage.c | 10 +-
src/backend/commands/tablecmds.c | 2 +-
src/backend/storage/smgr/md.c | 4 +-
src/include/utils/rel.h | 3 +-
```
Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc.
implementations need to know the 'relpersistence' of the relation. To get that information where
it's needed, we added the 'relpersistence' field to smgropen().
### How to get rid of the patch
Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the
benefit of extensions like Neon. Should consider this in the patch to make smgr API usable to
extensions.
### Alternatives
Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on
compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially
ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the
relpersistence argument to handle index builds? See item on "Mark index builds that use buffer
manager without logging explicitly".
## Use smgr and dbsize_hook for size calculations
```
src/backend/utils/adt/dbsize.c | 61 +-
```
In PostgreSQL, the rel and db-size functions scan the data directory directly. That won't work in Neon.
### How to get rid of the patch
Send patch to PostgreSQL, to use smgr API functions for relation size calculation instead. Maybe as
part of the general smgr API patch.
# WAL redo process changes
Pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall
victim to carefully designed malicious WAL records and start doing harmful things to the system. To
prevent this, the redo functions are executed in a separate process that is sandboxed with Linux
Secure Computing mode (see seccomp(2) man page).
As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust.
That is not impossible, but it would take a lot of effort to rewrite them and to ensure the rewrite is
correct, and once that's done, a lot of ongoing maintenance effort to keep the rewritten code in sync
across new PostgreSQL versions. That's why we want to leverage the PostgreSQL code.
Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be
safe to call them directly from Rust code, without needing the security sandbox. That's not feasible
for similar reasons as rewriting them in Rust.
## Don't replay changes in XLogReadBufferForRedo that are not for the target page we're replaying
```
src/backend/access/gin/ginxlog.c | 19 +-
Also some changes in xlog.c and xlogutils.c
Example:
@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record)
if (!isLeaf)
ginRedoClearIncompleteSplit(record, 3);
- if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
+ action = XLogReadBufferForRedo(record, 0, &lbuffer);
+ if (action != BLK_RESTORED && action != BLK_DONE)
elog(ERROR, "GIN split record did not contain a full-page image of left page");
```
### Problem we're trying to solve
In PostgreSQL, if a WAL redo function calls XLogReadBufferForRedo() for a page that has a full-page
image, it always succeeds. However, the Neon WAL redo process is only concerned with replaying changes
to a single page, so replaying any changes for other pages is a waste of cycles. We have modified
XLogReadBufferForRedo() to return BLK_DONE for all other pages, to avoid the overhead. That is
unexpected by code like the above.
### How to get rid of the patch
Submit the changes to upstream, hope the community accepts them. There's no harm to PostgreSQL from
these changes, although it doesn't have any benefit either.
To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead in the
WAL, and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes
to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or
BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream.
### Alternatives
Maybe we could revert this optimization, and restore pages other than the target page too.
## Add predefined_sysidentifier flag to initdb
```
src/backend/bootstrap/bootstrap.c | 13 +-
src/bin/initdb/initdb.c | 4 +
And some changes in xlog.c
```
This is used to help with restoring a database when you have all the WAL, all the way back to
initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same
sysidentifier.
### How to get rid of the patch
Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres
patches, we can just keep it around as a patch or as separate branch in a repo.
# Not currently committed but proposed
## Disable ring buffer buffer manager strategies
### Why?
Postgres tries to avoid cache flushing for bulk operations (COPY, seqscan, vacuum, ...):
even if there is free space in the buffer cache, pages may be evicted.
The negative effect of this can be somewhat compensated by the file system cache, but in Neon,
the cost of requesting a page from the page server is much higher.
### Alternatives?
Instead of just prohibiting the ring buffer, we may try to implement a more flexible eviction policy,
for example copying an evicted page from the ring buffer to some other buffer if there is free space
in the buffer cache.
## Disable marking page as dirty when hint bits are set.
### Why?
Postgres has to modify a page twice: first when some tuple is updated, and a second time when
hint bits are set. WAL-logging hint bit updates requires an FPI, which significantly increases the size of the WAL.
### Alternatives?
Add special WAL record for setting page hints.
## Prefetching
### Why?
Since pages in Neon are loaded on demand, we need some mechanism for bulk loading to reduce node
startup time and to speed up some massive queries by cutting page-request round-trip overhead.
Currently Postgres supports prefetching only for bitmap scans.
In Neon we should also use prefetching for sequential and index scans, because the OS is not doing it for us.
For a sequential scan we could prefetch some number of following pages; for an index scan we could
prefetch the heap pages addressed by the TIDs.
## Prewarming
### Why?
Short downtime (in other words, fast compute node restart time) is one of the key features of Zenith.
But the overhead of a request-response round trip for loading pages on demand can make the started node's warm-up quite slow.
We can capture the state of the compute node's buffer cache and send a bulk request for these pages at startup.
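A minimal sketch of that idea, with hypothetical names (`PageTag`, the batch size, and the stub snapshot are all invented here):

```rust
/// Hypothetical identifier of a cached page: (relation id, block number).
type PageTag = (u32, u32);

/// Capture the buffer cache state on shutdown (stub data for the sketch).
fn capture_buffer_cache() -> Vec<PageTag> {
    vec![(16384, 0), (16384, 1), (16385, 7)]
}

/// On startup, request all previously cached pages in batches instead of
/// paying one round trip per page on first access.
fn prewarm(pages: &[PageTag], request_batch: impl Fn(&[PageTag])) {
    // A real implementation would overlap requests with query execution.
    for chunk in pages.chunks(1024) {
        request_batch(chunk);
    }
}

fn main() {
    let snapshot = capture_buffer_cache();
    prewarm(&snapshot, |chunk| {
        println!("prefetching {} pages from the page server", chunk.len());
    });
}
```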

View File

@@ -1,9 +0,0 @@
# Page Service
The Page Service listens for GetPage@LSN requests from the Compute Nodes,
and responds with pages from the repository. On each GetPage@LSN request,
it calls into the Repository function
A separate thread is spawned for each incoming connection to the page
service. The page service uses the libpq protocol to communicate with
the client. The client is a Compute Postgres instance.

View File

@@ -1,8 +0,0 @@
# Page cache
TODO:
- shared across tenants
- store pages from layer files
- store pages from "in-memory layer"
- store materialized pages

View File

@@ -1,4 +0,0 @@
# Processing a GetPage request
TODO:
- sequence diagram that shows how a GetPage@LSN request is processed

View File

@@ -1,5 +0,0 @@
# Processing WAL
TODO:
- diagram that shows how incoming WAL is processed
- explain durability, what is fsync'd when, disk_consistent_lsn

View File

@@ -1,26 +0,0 @@
## Thread management
Each thread in the system is tracked by the `thread_mgr` module. It
maintains a registry of threads, and which tenant or timeline they are
operating on. This is used for safe shutdown of a tenant, or the whole
system.
### Handling shutdown
When a tenant or timeline is deleted, we need to shut down all threads
operating on it, before deleting the data on disk. A thread registered
in the thread registry can check if it has been requested to shut down,
by calling `is_shutdown_requested()`. For async operations, there's also
a `shutdown_watcher()` async task that can be used to wake up on shutdown.
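A simplified, self-contained sketch of the pattern (the real registry lives in `thread_mgr` and is keyed by tenant/timeline; here a plain `AtomicBool` stands in for `is_shutdown_requested()`):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;

/// Spawn a worker that checks the shutdown flag between units of work,
/// mimicking a registered pageserver thread.
fn spawn_worker(shutdown: Arc<AtomicBool>) -> thread::JoinHandle<()> {
    thread::spawn(move || loop {
        // Equivalent of calling is_shutdown_requested() between work items.
        if shutdown.load(Ordering::Relaxed) {
            break;
        }
        // ... do one unit of work for the tenant/timeline ...
        thread::sleep(Duration::from_millis(10));
    })
}

fn main() {
    let shutdown = Arc::new(AtomicBool::new(false));
    let worker = spawn_worker(Arc::clone(&shutdown));
    // Tenant is being detached/deleted: request shutdown, then wait for the
    // worker to exit before touching the data on disk.
    shutdown.store(true, Ordering::Relaxed);
    worker.join().unwrap();
}
```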
### Sync vs async
The primary programming model in the page server is synchronous,
blocking code. However, there are some places where async code is
used. Be very careful when mixing sync and async code.
Async is primarily used to wait for incoming data on network
connections. For example, all WAL receivers have a shared thread pool,
with one async Task for each connection. Once a piece of WAL has been
received from the network, the thread calls the blocking functions in
the Repository to process the WAL.

View File

@@ -1,77 +0,0 @@
# WAL Redo
To reconstruct a particular page version from an image of the page and
some WAL records, the pageserver needs to replay the WAL records. This
happens on-demand, when a GetPage@LSN request comes in, or as part of
background jobs that reorganize data for faster access.
It's important that data cannot leak from one tenant to another, and
that a corrupt WAL record on one timeline doesn't affect other tenants
or timelines.
## Multi-tenant security
If you have direct access to the WAL directory, or if you have
superuser access to a running PostgreSQL server, it's easy to
construct a malicious or corrupt WAL record that causes the WAL redo
functions to crash, or to execute arbitrary code. That is not a
security problem for PostgreSQL; if you have superuser access, you
have full access to the system anyway.
The Neon pageserver, however, is multi-tenant. It needs to execute WAL
belonging to different tenants in the same system, and malicious WAL
in one tenant must not affect other tenants.
A separate WAL redo process is launched for each tenant, and the
process uses the seccomp(2) system call to restrict its access to the
bare minimum needed to replay WAL records. The process does not have
access to the filesystem or network. It can only communicate with the
parent pageserver process through a pipe.
If an attacker creates a malicious WAL record and injects it into the
WAL stream of a timeline, he can take control of the WAL redo process
in the pageserver. However, the WAL redo process cannot access the
rest of the system. And because there is a separate WAL redo process
for each tenant, the hijacked WAL redo process can only see WAL and
data belonging to the same tenant, which the attacker would have
access to anyway.
## WAL-redo process communication
The WAL redo process runs the 'postgres' executable, launched with a
Neon-specific command-line option to put it into WAL-redo process
mode. The pageserver controls the lifetime of the WAL redo processes,
launching them as needed. If a tenant is detached from the pageserver,
any WAL redo processes for that tenant are killed.
The pageserver communicates with each WAL redo process over its
stdin/stdout/stderr. It works in request-response model with a simple
custom protocol, described in walredo.rs. To replay a set of WAL
records for a page, the pageserver sends the "before" image of the
page and the WAL records over 'stdin', followed by a command to
perform the replay. The WAL redo process responds with an "after"
image of the page.
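Schematically, the interaction is a blocking request/response over the child's pipes. The sketch below fakes the framing (the real protocol is defined in walredo.rs) and uses generic readers/writers so it stays runnable without spawning a real `postgres --wal-redo` process:

```rust
use std::io::{self, Read, Write};

const BLCKSZ: usize = 8192;

/// Schematic request/response driver, not the real protocol: send the
/// before-image and the WAL records, then read back one reconstructed page.
fn apply_records<W: Write, R: Read>(
    to_child: &mut W,   // the redo process's stdin
    from_child: &mut R, // the redo process's stdout
    before_image: &[u8; BLCKSZ],
    records: &[Vec<u8>],
) -> io::Result<[u8; BLCKSZ]> {
    to_child.write_all(before_image)?;
    for rec in records {
        to_child.write_all(rec)?;
    }
    to_child.flush()?;

    let mut after_image = [0u8; BLCKSZ];
    from_child.read_exact(&mut after_image)?;
    Ok(after_image)
}

fn main() -> io::Result<()> {
    // Stand-in "child": a byte sink for requests and a canned reply, so the
    // sketch runs without an actual WAL redo process.
    let mut sent = Vec::new();
    let mut reply = io::Cursor::new(vec![0u8; BLCKSZ]);
    let page = [0u8; BLCKSZ];
    let after = apply_records(&mut sent, &mut reply, &page, &[vec![1, 2, 3]])?;
    assert_eq!(after.len(), BLCKSZ);
    Ok(())
}
```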
## Special handling of some records
Some WAL record types are handled directly in the pageserver, by
bespoke Rust code, and are not sent over to the WAL redo process.
This includes SLRU-related WAL records, like commit records. SLRUs
don't use the standard Postgres buffer manager, so dealing with them
in the Neon WAL redo mode would require quite a few changes to
Postgres code and special handling in the protocol anyway.
Some record types that include a full-page-image (e.g. XLOG_FPI) are
also handled specially when incoming WAL is processed already, and are
stored as page images rather than WAL records.
## Records that modify multiple pages
Some Postgres WAL records modify multiple pages. Such WAL records are
duplicated, so that a copy is stored for each affected page. This is
somewhat wasteful, but because most WAL records only affect one page,
the overhead is acceptable.
The WAL redo always happens for one particular page. If the WAL record
contains changes to other pages, they are ignored.

View File

@@ -1,11 +0,0 @@
# Page server architecture
The Page Server has a few different duties:
- Respond to GetPage@LSN requests from the Compute Nodes
- Receive WAL from WAL safekeeper, and store it
- Upload data to S3 to make it durable, download files from S3 as needed
S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not synced to S3 yet.

View File

@@ -1,8 +0,0 @@
# Separation of Compute and Storage
TODO:
- Read path
- Write path
- Durability model
- API auth

View File

@@ -23,7 +23,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
[dev-dependencies]
env_logger = "0.9"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
wal_craft = { path = "wal_craft" }
wal_generate = { path = "wal_generate" }
[build-dependencies]
bindgen = "0.59.1"

View File

@@ -2,7 +2,6 @@ extern crate bindgen;
use std::env;
use std::path::PathBuf;
use std::process::Command;
use bindgen::callbacks::ParseCallbacks;
@@ -46,43 +45,6 @@ fn main() {
// Tell cargo to invalidate the built crate whenever the wrapper changes
println!("cargo:rerun-if-changed=pg_control_ffi.h");
// Finding the location of C headers for the Postgres server:
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
// - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/tmp_install/include/postgresql/server`
let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR")
{
postgres_install_dir.into()
} else {
PathBuf::from("tmp_install")
};
if pg_install_dir.is_relative() {
let cwd = env::current_dir().unwrap();
pg_install_dir = cwd.join("..").join("..").join(pg_install_dir);
}
let pg_config_bin = pg_install_dir.join("bin").join("pg_config");
let inc_server_path: String = if pg_config_bin.exists() {
let output = Command::new(pg_config_bin)
.arg("--includedir-server")
.output()
.expect("failed to execute `pg_config --includedir-server`");
if !output.status.success() {
panic!("`pg_config --includedir-server` failed")
}
String::from_utf8(output.stdout).unwrap().trim_end().into()
} else {
pg_install_dir
.join("include")
.join("postgresql")
.join("server")
.into_os_string()
.into_string()
.unwrap()
};
// The bindgen::Builder is the main entry point
// to bindgen, and lets you build up options for
// the resulting bindings.
@@ -119,7 +81,15 @@ fn main() {
// explicit padding fields.
.explicit_padding(true)
//
.clang_arg(format!("-I{inc_server_path}"))
// Path the server include dir. It is in tmp_install/include/server, if you did
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
// and used DESTDIR to move it into tmp_install, then it's in
// tmp_install/include/postgres/server
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
// but this will do for now.
//
.clang_arg("-I../../tmp_install/include/server")
.clang_arg("-I../../tmp_install/include/postgresql/server")
//
// Finish the builder and generate the bindings.
//

View File

@@ -82,17 +82,7 @@ impl WalStreamDecoder {
// that cross page boundaries.
loop {
// parse and verify page boundaries as we go
if self.padlen > 0 {
// We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record.
if self.inputbuf.remaining() < self.padlen as usize {
return Ok(None);
}
// skip padding
self.inputbuf.advance(self.padlen as usize);
self.lsn += self.padlen as u64;
self.padlen = 0;
} else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
// parse long header
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
@@ -138,6 +128,15 @@ impl WalStreamDecoder {
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
continue;
} else if self.padlen > 0 {
if self.inputbuf.remaining() < self.padlen as usize {
return Ok(None);
}
// skip padding
self.inputbuf.advance(self.padlen as usize);
self.lsn += self.padlen as u64;
self.padlen = 0;
} else if self.contlen == 0 {
assert!(self.recordbuf.is_empty());
@@ -227,10 +226,10 @@ impl WalStreamDecoder {
self.padlen = self.lsn.calc_padding(8u32) as u32;
}
// We should return LSN of the next record, not the last byte of this record or
// the byte immediately after. Note that this handles both XLOG_SWITCH and usual
// records, the former "spans" until the next WAL segment (see test_xlog_switch).
let result = (self.lsn + self.padlen as u64, recordbuf);
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
// and WalReceiver integration. Since this code is used both for WalReceiver and
// initial WAL import let's force alignment right here.
let result = (self.lsn.align(), recordbuf);
Ok(Some(result))
}
}

View File

@@ -15,7 +15,6 @@ use crate::XLogPageHeaderData;
use crate::XLogRecord;
use crate::XLOG_PAGE_MAGIC;
use crate::pg_constants::WAL_SEGMENT_SIZE;
use anyhow::{bail, ensure};
use byteorder::{ByteOrder, LittleEndian};
use bytes::BytesMut;
@@ -462,7 +461,8 @@ pub fn find_end_of_wal(
pub fn main() {
let mut data_dir = PathBuf::new();
data_dir.push(".");
let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap();
let wal_seg_size = 16 * 1024 * 1024;
let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap();
println!(
"wal_end={:>08X}{:>08X}, tli={}",
(wal_end >> 32) as u32,
@@ -597,18 +597,20 @@ mod tests {
fn init_logging() {
let _ = env_logger::Builder::from_env(
env_logger::Env::default()
.default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"),
.default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"),
)
.is_test(true)
.try_init();
}
fn test_end_of_wal<C: wal_craft::Crafter>(
fn test_end_of_wal(
test_name: &str,
generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result<postgres::types::PgLsn>,
expected_end_of_wal_non_partial: Lsn,
last_segment: &str,
) {
use wal_craft::*;
// Craft some WAL
use wal_generate::*;
// 1. Generate some WAL
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("..")
.join("..");
@@ -620,72 +622,25 @@ mod tests {
fs::remove_dir_all(&cfg.datadir).unwrap();
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into();
let mut srv = cfg.start_server().unwrap();
let expected_wal_end: Lsn =
u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
let last_segment = cfg
.wal_dir()
.read_dir()
.unwrap()
.map(|f| f.unwrap().file_name().into_string().unwrap())
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial);
for start_lsn in std::iter::once(Lsn(0))
.chain(intermediate_lsns)
.chain(std::iter::once(expected_end_of_wal_partial))
{
// Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
// We assume that `start_lsn` is non-decreasing.
info!(
"Checking with start_lsn={}, erasing WAL before it",
start_lsn
);
for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
let fname = file.file_name().into_string().unwrap();
if !IsXLogFileName(&fname) {
continue;
}
let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
if seg_start_lsn > u64::from(start_lsn) {
continue;
}
let mut f = File::options().write(true).open(file.path()).unwrap();
const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
f.write_all(
&ZEROS[0..min(
WAL_SEGMENT_SIZE,
(u64::from(start_lsn) - seg_start_lsn) as usize,
)],
)
.unwrap();
}
check_end_of_wal(
&cfg,
&last_segment,
start_lsn,
expected_end_of_wal_non_partial,
expected_end_of_wal_partial,
);
}
}
// 2. Pick WAL generated by initdb
let wal_dir = cfg.datadir.join("pg_wal");
let wal_seg_size = 16 * 1024 * 1024;
fn check_pg_waldump_end_of_wal(
cfg: &wal_craft::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
// 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
let wal_end = Lsn(wal_end);
info!(
"find_end_of_wal returned (wal_end={}, tli={})",
wal_end, tli
);
assert_eq!(wal_end, expected_end_of_wal_non_partial);
// 4. Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
.unwrap()
@@ -704,66 +659,44 @@ mod tests {
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
waldump_wal_end, expected_wal_end
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
assert_eq!(waldump_wal_end, expected_wal_end);
fn check_end_of_wal(
cfg: &wal_craft::Conf,
last_segment: &str,
start_lsn: Lsn,
expected_end_of_wal_non_partial: Lsn,
expected_end_of_wal_partial: Lsn,
) {
// Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
let (wal_end, tli) =
find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
let wal_end = Lsn(wal_end);
info!(
"find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment",
wal_end, tli
);
assert_eq!(wal_end, expected_end_of_wal_non_partial);
// Rename file to partial to actually find last valid lsn, then rename it back.
// 5. Rename file to partial to actually find last valid lsn
fs::rename(
cfg.wal_dir().join(&last_segment),
cfg.wal_dir().join(format!("{}.partial", last_segment)),
wal_dir.join(last_segment),
wal_dir.join(format!("{}.partial", last_segment)),
)
.unwrap();
let (wal_end, tli) =
find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
let wal_end = Lsn(wal_end);
info!(
"find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment",
"find_end_of_wal returned (wal_end={}, tli={})",
wal_end, tli
);
assert_eq!(wal_end, expected_end_of_wal_partial);
fs::rename(
cfg.wal_dir().join(format!("{}.partial", last_segment)),
cfg.wal_dir().join(last_segment),
)
.unwrap();
assert_eq!(wal_end, waldump_wal_end);
}
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
#[test]
pub fn test_find_end_of_wal_simple() {
init_logging();
test_end_of_wal::<wal_craft::Simple>(
test_end_of_wal(
"test_find_end_of_wal_simple",
wal_generate::generate_simple,
"0/2000000".parse::<Lsn>().unwrap(),
"000000010000000000000001",
);
}
#[test]
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
init_logging();
test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
test_end_of_wal(
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
wal_generate::generate_wal_record_crossing_segment_followed_by_small_one,
"0/3000000".parse::<Lsn>().unwrap(),
"000000010000000000000002",
);
}
@@ -771,9 +704,11 @@ mod tests {
#[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
pub fn test_find_end_of_wal_last_crossing_segment() {
init_logging();
test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
test_end_of_wal(
"test_find_end_of_wal_last_crossing_segment",
wal_generate::generate_last_wal_record_crossing_segment,
"0/3000000".parse::<Lsn>().unwrap(),
"000000010000000000000002",
);
}

View File

@@ -1,103 +0,0 @@
use anyhow::*;
use clap::{App, Arg, ArgMatches};
use std::str::FromStr;
use wal_craft::*;
fn main() -> Result<()> {
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
.init();
let type_arg = &Arg::new("type")
.takes_value(true)
.help("Type of WAL to craft")
.possible_values([
Simple::NAME,
LastWalRecordXlogSwitch::NAME,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
LastWalRecordCrossingSegment::NAME,
])
.required(true);
let arg_matches = App::new("Postgres WAL crafter")
.about("Crafts Postgres databases with specific WAL properties")
.subcommand(
App::new("print-postgres-config")
.about("Print the configuration required for PostgreSQL server before running this script")
)
.subcommand(
App::new("with-initdb")
.about("Craft WAL in a new data directory first initialized with initdb")
.arg(type_arg)
.arg(
Arg::new("datadir")
.takes_value(true)
.help("Data directory for the Postgres server")
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.takes_value(true)
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
.default_value("/usr/local")
)
)
.subcommand(
App::new("in-existing")
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
.arg(type_arg)
.arg(
Arg::new("connection")
.takes_value(true)
.help("Connection string to the Postgres database to populate")
.required(true)
)
)
.get_matches();
let wal_craft = |arg_matches: &ArgMatches, client| {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
Simple::NAME => Simple::craft(client)?,
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)?
}
WalRecordCrossingSegmentFollowedBySmallOne::NAME => {
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
}
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
a => panic!("Unknown --type argument: {}", a),
};
for lsn in intermediate_lsns {
println!("intermediate_lsn = {}", lsn);
}
println!("end_of_wal = {}", end_of_wal_lsn);
Ok(())
};
match arg_matches.subcommand() {
None => panic!("No subcommand provided"),
Some(("print-postgres-config", _)) => {
for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
println!("{}", cfg);
}
Ok(())
}
Some(("with-initdb", arg_matches)) => {
let cfg = Conf {
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
datadir: arg_matches.value_of("datadir").unwrap().into(),
};
cfg.initdb()?;
let srv = cfg.start_server()?;
wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?;
srv.kill();
Ok(())
}
Some(("in-existing", arg_matches)) => wal_craft(
arg_matches,
&mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
.connect(postgres::NoTls)?,
),
Some(_) => panic!("Unknown subcommand"),
}
}

View File

@@ -1,5 +1,5 @@
[package]
name = "wal_craft"
name = "wal_generate"
version = "0.1.0"
edition = "2021"
@@ -10,7 +10,5 @@ anyhow = "1.0"
clap = "3.0"
env_logger = "0.9"
log = "0.4"
once_cell = "1.8.0"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
postgres_ffi = { path = "../" }
tempfile = "3.2"

View File

@@ -0,0 +1,58 @@
use anyhow::*;
use clap::{App, Arg};
use wal_generate::*;
fn main() -> Result<()> {
env_logger::Builder::from_env(
env_logger::Env::default().default_filter_or("wal_generate=info"),
)
.init();
let arg_matches = App::new("Postgres WAL generator")
.about("Generates Postgres databases with specific WAL properties")
.arg(
Arg::new("datadir")
.short('D')
.long("datadir")
.takes_value(true)
.help("Data directory for the Postgres server")
.required(true)
)
.arg(
Arg::new("pg-distrib-dir")
.long("pg-distrib-dir")
.takes_value(true)
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
.default_value("/usr/local")
)
.arg(
Arg::new("type")
.long("type")
.takes_value(true)
.help("Type of WAL to generate")
.possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
.required(true)
)
.get_matches();
let cfg = Conf {
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
datadir: arg_matches.value_of("datadir").unwrap().into(),
};
cfg.initdb()?;
let mut srv = cfg.start_server()?;
let lsn = match arg_matches.value_of("type").unwrap() {
"simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
"last_wal_record_crossing_segment" => {
generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
}
"wal_record_crossing_segment_followed_by_small_one" => {
generate_wal_record_crossing_segment_followed_by_small_one(
&mut srv.connect_with_timeout()?,
)?
}
a => panic!("Unknown --type argument: {}", a),
};
println!("end_of_wal = {}", lsn);
srv.kill();
Ok(())
}

View File

@@ -1,13 +1,8 @@
use anyhow::*;
use core::time::Duration;
use log::*;
use once_cell::sync::Lazy;
use postgres::types::PgLsn;
use postgres::Client;
use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE;
use postgres_ffi::xlog_utils::{
XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
};
use std::cmp::Ordering;
use std::fs;
use std::path::{Path, PathBuf};
@@ -27,16 +22,6 @@ pub struct PostgresServer {
client_config: postgres::Config,
}
pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
vec![
"wal_keep_size=50MB", // Ensure old WAL is not removed
"shared_preload_libraries=neon", // can only be loaded at startup
// Disable background processes as much as possible
"wal_writer_delay=10s",
"autovacuum=off",
]
});
impl Conf {
fn pg_bin_dir(&self) -> PathBuf {
self.pg_distrib_dir.join("bin")
@@ -46,10 +31,6 @@ impl Conf {
self.pg_distrib_dir.join("lib")
}
pub fn wal_dir(&self) -> PathBuf {
self.datadir.join("pg_wal")
}
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
let path = self.pg_bin_dir().join(command);
ensure!(path.exists(), "Command {:?} does not exist", path);
@@ -104,8 +85,12 @@ impl Conf {
.arg(unix_socket_dir_path.as_os_str())
.arg("-D")
.arg(self.datadir.as_os_str())
.args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
.args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
.args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
.args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup
// Disable background processes as much as possible
.args(&["-c", "wal_writer_delay=10s"])
.args(&["-c", "autovacuum=off"])
.stderr(Stdio::from(log_file))
.spawn()?;
let server = PostgresServer {
@@ -159,7 +144,7 @@ impl PostgresServer {
bail!("Connection timed out");
}
pub fn kill(mut self) {
pub fn kill(&mut self) {
self.process.kill().unwrap();
self.process.wait().unwrap();
}
@@ -196,16 +181,12 @@ pub trait PostgresClientExt: postgres::GenericClient {
impl<C: postgres::GenericClient> PostgresClientExt for C {}
pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
fn generate_internal<C: postgres::GenericClient>(
client: &mut C,
f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
) -> Result<PgLsn> {
client.execute("create extension if not exists neon_test_utils", &[])?;
let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
ensure!(wal_keep_size == "50MB");
let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0);
ensure!(wal_writer_delay == "10s");
let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0);
ensure!(autovacuum == "off");
let wal_segment_size = client.query_one(
"select cast(setting as bigint) as setting, unit \
from pg_settings where name = 'wal_segment_size'",
@@ -216,160 +197,44 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result
"Unexpected wal_segment_size unit"
);
ensure!(
wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64,
wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
"Unexpected wal_segment_size in bytes"
);
Ok(())
}
pub trait Crafter {
const NAME: &'static str;
/// Generates WAL using the client `client`. Returns a pair of:
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
/// May include or exclude Lsn(0) and the end-of-wal.
/// * The expected end-of-wal LSN.
fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)>;
}
fn craft_internal<C: postgres::GenericClient>(
client: &mut C,
f: impl Fn(&mut C, PgLsn) -> Result<(Vec<PgLsn>, Option<PgLsn>)>,
) -> Result<(Vec<PgLsn>, PgLsn)> {
ensure_server_config(client)?;
let initial_lsn = client.pg_current_wal_insert_lsn()?;
info!("LSN initial = {}", initial_lsn);
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
let last_lsn = match last_lsn {
let last_lsn = match f(client, initial_lsn)? {
None => client.pg_current_wal_insert_lsn()?,
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
Ordering::Less => bail!("Some records were inserted after the generated WAL"),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
},
};
if !intermediate_lsns.starts_with(&[initial_lsn]) {
intermediate_lsns.insert(0, initial_lsn);
}
// Some records may not be flushed, e.g. non-transactional logical messages.
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
Ordering::Less => bail!("Some records were flushed after the generated WAL"),
Ordering::Equal => {}
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
}
Ok((intermediate_lsns, last_lsn))
Ok(last_lsn)
}
pub struct Simple;
impl Crafter for Simple {
const NAME: &'static str = "simple";
fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
craft_internal(client, |client, _| {
client.execute("CREATE table t(x int)", &[])?;
Ok((Vec::new(), None))
})
}
}
pub struct LastWalRecordXlogSwitch;
impl Crafter for LastWalRecordXlogSwitch {
const NAME: &'static str = "last_wal_record_xlog_switch";
fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
// Do not use generate_internal because here we end up with flush_lsn exactly on
// the segment boundary and insert_lsn after the initial page header, which is unusual.
ensure_server_config(client)?;
pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
generate_internal(client, |client, _| {
client.execute("CREATE table t(x int)", &[])?;
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
after_xlog_switch <= next_segment,
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
after_xlog_switch,
next_segment
);
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}
Ok(None)
})
}
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
// Do not use generate_internal because here we end up with flush_lsn exactly on
// the segment boundary and insert_lsn after the initial page header, which is unusual.
ensure_server_config(client)?;
client.execute("CREATE table t(x int)", &[])?;
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
// We will use a logical message as the padding. We start with detecting how much WAL
// it takes for one logical message, considering all alignments and headers.
let base_wal_advance = {
let before_lsn = client.pg_current_wal_insert_lsn()?;
// A small non-empty message bigger than a few bytes is more likely than an empty
// message to have the same format as the big padding message.
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
&[],
)?;
// The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+ XLOG_SIZE_OF_XLOG_RECORD
};
let mut remaining_lsn =
XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
if remaining_lsn < base_wal_advance {
remaining_lsn += XLOG_BLCKSZ;
}
let repeats = 10 + remaining_lsn - base_wal_advance;
info!(
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
client.pg_current_wal_insert_lsn()?,
remaining_lsn,
base_wal_advance,
repeats
);
client.execute(
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
&[&(repeats as i32)],
)?;
info!(
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
client.pg_current_wal_insert_lsn()?,
XLOG_SIZE_OF_XLOG_RECORD
);
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
after_xlog_switch < next_segment,
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
after_xlog_switch,
next_segment
);
ensure!(
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}",
after_xlog_switch
);
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}
}
fn craft_single_logical_message(
fn generate_single_logical_message(
client: &mut impl postgres::GenericClient,
transactional: bool,
) -> Result<(Vec<PgLsn>, PgLsn)> {
craft_internal(client, |client, initial_lsn| {
) -> Result<PgLsn> {
generate_internal(client, |client, initial_lsn| {
ensure!(
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
"Initial LSN is too far in the future"
@@ -400,25 +265,21 @@ fn craft_single_logical_message(
message_lsn < after_message_lsn,
"No record found after the emitted message"
);
Ok((vec![message_lsn], Some(after_message_lsn)))
Ok(Some(after_message_lsn))
} else {
Ok((Vec::new(), Some(message_lsn)))
Ok(Some(message_lsn))
}
})
}
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
craft_single_logical_message(client, true)
}
pub fn generate_wal_record_crossing_segment_followed_by_small_one(
client: &mut impl postgres::GenericClient,
) -> Result<PgLsn> {
generate_single_logical_message(client, true)
}
pub struct LastWalRecordCrossingSegment;
impl Crafter for LastWalRecordCrossingSegment {
const NAME: &'static str = "last_wal_record_crossing_segment";
fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
craft_single_logical_message(client, false)
}
pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
client: &mut C,
) -> Result<PgLsn> {
generate_single_logical_message(client, false)
}

View File

@@ -12,10 +12,8 @@ use std::{
borrow::Cow,
collections::HashMap,
ffi::OsStr,
fmt::Debug,
num::{NonZeroU32, NonZeroUsize},
path::{Path, PathBuf},
pin::Pin,
};
use anyhow::{bail, Context};
@@ -42,19 +40,13 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
pub trait RemoteObjectName {
// Needed to retrieve the last component of a RemoteObjectId,
// in other words, a file name
fn object_name(&self) -> Option<&str>;
}
/// Storage (potentially remote) API to manage its state.
/// This storage tries to be unaware of any layered repository context,
/// providing basic CRUD operations for storage files.
#[async_trait::async_trait]
pub trait RemoteStorage: Send + Sync {
/// A way to uniquely reference a file in the remote storage.
type RemoteObjectId: RemoteObjectName;
type RemoteObjectId;
/// Attempts to derive the storage path out of the local path, if the latter is correct.
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
@@ -65,12 +57,6 @@ pub trait RemoteStorage: Send + Sync {
/// Lists all items the storage has right now.
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
/// Lists all top level subdirectories for a given prefix
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
/// Streams the local file contents into remote into the remote storage entry.
async fn upload(
&self,
@@ -84,7 +70,11 @@ pub trait RemoteStorage: Send + Sync {
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
/// Returns the metadata, if any was stored with the file previously.
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError>;
async fn download(
&self,
from: &Self::RemoteObjectId,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>>;
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
/// Returns the metadata, if any was stored with the file previously.
@@ -93,49 +83,12 @@ pub trait RemoteStorage: Send + Sync {
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError>;
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>>;
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
}
pub struct Download {
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}
impl Debug for Download {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Download")
.field("metadata", &self.metadata)
.finish()
}
}
#[derive(Debug)]
pub enum DownloadError {
/// Validation or other error happened due to user input.
BadInput(anyhow::Error),
/// The file was not found in the remote storage.
NotFound,
/// The file was found in the remote storage, but the download failed.
Other(anyhow::Error),
}
impl std::fmt::Display for DownloadError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
DownloadError::BadInput(e) => {
write!(f, "Failed to download a remote file due to user input: {e}")
}
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"),
}
}
}
impl std::error::Error for DownloadError {}
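For orientation, here is a minimal, hypothetical caller of the streaming variant of this API (the `download` that returns a `Download`): it reads the stream into memory and treats `DownloadError::NotFound` as an expected case rather than a failure. The helper name `fetch_to_vec` is illustrative and not part of the codebase.
```rust
use tokio::io::AsyncReadExt;

// Sketch only: read a `Download` fully into memory, mapping "not found" to `None`.
async fn fetch_to_vec<S: RemoteStorage>(
    storage: &S,
    id: &S::RemoteObjectId,
) -> anyhow::Result<Option<Vec<u8>>> {
    match storage.download(id).await {
        Ok(mut download) => {
            let mut buf = Vec::new();
            // `download_stream` is a pinned, boxed `AsyncRead`, so it can be read directly.
            download.download_stream.read_to_end(&mut buf).await?;
            Ok(Some(buf))
        }
        Err(DownloadError::NotFound) => Ok(None),
        Err(other) => Err(anyhow::anyhow!("download failed: {other}")),
    }
}
```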
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
pub enum GenericRemoteStorage {
@@ -227,7 +180,7 @@ pub struct S3Config {
pub concurrency_limit: NonZeroUsize,
}
impl Debug for S3Config {
impl std::fmt::Debug for S3Config {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("S3Config")
.field("bucket_name", &self.bucket_name)

View File

@@ -5,7 +5,6 @@
//! volume is mounted to the local FS.
use std::{
borrow::Cow,
future::Future,
path::{Path, PathBuf},
pin::Pin,
@@ -18,16 +17,10 @@ use tokio::{
};
use tracing::*;
use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};
use crate::path_with_suffix_extension;
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
impl RemoteObjectName for PathBuf {
fn object_name(&self) -> Option<&str> {
self.file_stem().and_then(|n| n.to_str())
}
}
pub struct LocalFs {
working_directory: PathBuf,
storage_root: PathBuf,
@@ -108,18 +101,7 @@ impl RemoteStorage for LocalFs {
}
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
get_all_files(&self.storage_root, true).await
}
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let path = match prefix {
Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
None => Cow::Borrowed(&self.storage_root),
};
get_all_files(path.as_ref(), false).await
get_all_files(&self.storage_root).await
}
async fn upload(
@@ -210,56 +192,14 @@ impl RemoteStorage for LocalFs {
Ok(())
}
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
let file_path = self
.resolve_in_storage(from)
.map_err(DownloadError::BadInput)?;
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
let source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&file_path)
.await
.with_context(|| {
format!(
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})
.map_err(DownloadError::Other)?,
);
let metadata = self
.read_storage_metadata(&file_path)
.await
.map_err(DownloadError::Other)?;
Ok(Download {
metadata,
download_stream: Box::pin(source),
})
} else {
Err(DownloadError::NotFound)
}
}
async fn download_byte_range(
async fn download(
&self,
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError> {
if let Some(end_exclusive) = end_exclusive {
if end_exclusive <= start_inclusive {
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})")));
};
if start_inclusive == end_exclusive.saturating_sub(1) {
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
}
}
let file_path = self
.resolve_in_storage(from)
.map_err(DownloadError::BadInput)?;
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let mut source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
@@ -270,31 +210,81 @@ impl RemoteStorage for LocalFs {
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})
.map_err(DownloadError::Other)?,
})?,
);
io::copy(&mut source, to).await.with_context(|| {
format!(
"Failed to download file '{}' from the local storage",
file_path.display()
)
})?;
source.flush().await?;
self.read_storage_metadata(&file_path).await
} else {
bail!(
"File '{}' either does not exist or is not a file",
file_path.display()
)
}
}
async fn download_byte_range(
&self,
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
if let Some(end_exclusive) = end_exclusive {
ensure!(
end_exclusive > start_inclusive,
"Invalid range, start ({}) is bigger then end ({:?})",
start_inclusive,
end_exclusive
);
if start_inclusive == end_exclusive.saturating_sub(1) {
return Ok(None);
}
}
let file_path = self.resolve_in_storage(from)?;
if file_path.exists() && file_path.is_file() {
let mut source = io::BufReader::new(
fs::OpenOptions::new()
.read(true)
.open(&file_path)
.await
.with_context(|| {
format!(
"Failed to open source file '{}' to use in the download",
file_path.display()
)
})?,
);
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&file_path)
.await
.map_err(DownloadError::Other)?;
.context("Failed to seek to the range start in a local storage file")?;
match end_exclusive {
Some(end_exclusive) => {
io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
}
None => io::copy(&mut source, to).await,
}
.with_context(|| {
format!(
"Failed to download file '{}' range from the local storage",
file_path.display()
)
})?;
Ok(match end_exclusive {
Some(end_exclusive) => Download {
metadata,
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
},
None => Download {
metadata,
download_stream: Box::pin(source),
},
})
self.read_storage_metadata(&file_path).await
} else {
Err(DownloadError::NotFound)
bail!(
"File '{}' either does not exist or is not a file",
file_path.display()
)
}
}
@@ -317,7 +307,6 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {
fn get_all_files<'a, P>(
directory_path: P,
recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
where
P: AsRef<Path> + Send + Sync + 'a,
@@ -334,11 +323,7 @@ where
if file_type.is_symlink() {
debug!("{:?} us a symlink, skipping", entry_path)
} else if file_type.is_dir() {
if recursive {
paths.extend(get_all_files(entry_path, true).await?.into_iter())
} else {
paths.push(dir_entry.path())
}
paths.extend(get_all_files(entry_path).await?.into_iter())
} else {
paths.push(dir_entry.path());
}
@@ -367,19 +352,6 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
Ok(())
}
fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
if file_path.exists() {
ensure!(
file_path.is_file(),
"file path '{}' is not a file",
file_path.display()
);
Ok(true)
} else {
Ok(false)
}
}
#[cfg(test)]
mod pure_tests {
use tempfile::tempdir;
@@ -546,31 +518,6 @@ mod fs_tests {
use std::{collections::HashMap, io::Write};
use tempfile::tempdir;
async fn read_and_assert_remote_file_contents(
storage: &LocalFs,
#[allow(clippy::ptr_arg)]
// have to use &PathBuf due to `storage.local_path` parameter requirements
remote_storage_path: &PathBuf,
expected_metadata: Option<&StorageMetadata>,
) -> anyhow::Result<String> {
let mut download = storage
.download(remote_storage_path)
.await
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
ensure!(
download.metadata.as_ref() == expected_metadata,
"Unexpected metadata returned for the downloaded file"
);
let mut contents = String::new();
download
.download_stream
.read_to_string(&mut contents)
.await
.context("Failed to read remote file contents into string")?;
Ok(contents)
}
#[tokio::test]
async fn upload_file() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();
@@ -621,7 +568,15 @@ mod fs_tests {
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage.download(&upload_target, &mut content_bytes).await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
content_bytes.flush().await?;
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
assert_eq!(
dummy_contents(upload_name),
contents,
@@ -629,9 +584,13 @@ mod fs_tests {
);
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage.download(&non_existing_path).await {
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
match storage.download(&non_existing_path, &mut io::sink()).await {
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
}
Ok(())
}
@@ -644,31 +603,58 @@ mod fs_tests {
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let full_range_download_contents =
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage
.download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
full_range_bytes.flush().await?;
assert_eq!(
dummy_contents(upload_name),
full_range_download_contents,
String::from_utf8(full_range_bytes.into_inner().into_inner())?,
"Download full range should return the whole upload"
);
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let same_byte = 1_000_000_000;
let metadata = storage
.download_byte_range(
&upload_target,
same_byte,
Some(same_byte + 1), // exclusive end
&mut zero_range_bytes,
)
.await?;
assert!(
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
zero_range_bytes.flush().await?;
assert!(
zero_range_bytes.into_inner().into_inner().is_empty(),
"Zero byte range should not download any part of the file"
);
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
let mut first_part_download = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage
.download_byte_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&mut first_part_remote,
)
.await?;
assert!(
first_part_download.metadata.is_none(),
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut first_part_download.download_stream,
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
@@ -677,24 +663,20 @@ mod fs_tests {
"First part bytes should be returned when requested"
);
let mut second_part_download = storage
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let metadata = storage
.download_byte_range(
&upload_target,
first_part_local.len() as u64,
Some((first_part_local.len() + second_part_local.len()) as u64),
&mut second_part_remote,
)
.await?;
assert!(
second_part_download.metadata.is_none(),
metadata.is_none(),
"No metadata should be returned for no metadata upload"
);
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut second_part_download.download_stream,
&mut second_part_remote,
)
.await?;
second_part_remote.flush().await?;
let second_part_remote = second_part_remote.into_inner().into_inner();
assert_eq!(
@@ -714,30 +696,11 @@ mod fs_tests {
let upload_name = "upload_1";
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
let start = 1_000_000_000;
let end = start + 1;
match storage
.download_byte_range(
&upload_target,
start,
Some(end), // exclusive end
)
.await
{
Ok(_) => panic!("Should not allow downloading wrong ranges"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("zero bytes"));
assert!(error_string.contains(&start.to_string()));
assert!(error_string.contains(&end.to_string()));
}
}
let start = 10000;
let end = 234;
assert!(start > end, "Should test an incorrect range");
match storage
.download_byte_range(&upload_target, start, Some(end))
.download_byte_range(&upload_target, start, Some(end), &mut io::sink())
.await
{
Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -749,6 +712,18 @@ mod fs_tests {
}
}
let non_existing_path = PathBuf::from("somewhere").join("else");
match storage
.download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
.await
{
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
Err(e) => {
let error_string = e.to_string();
assert!(error_string.contains("does not exist"));
assert!(error_string.contains(&non_existing_path.display().to_string()));
}
}
Ok(())
}
@@ -787,26 +762,35 @@ mod fs_tests {
let upload_target =
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
let full_range_download_contents =
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
content_bytes.flush().await?;
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
assert_eq!(
dummy_contents(upload_name),
full_range_download_contents,
contents,
"We should upload and download the same contents"
);
assert_eq!(
full_download_metadata.as_ref(),
Some(&metadata),
"We should get the same metadata back for full download"
);
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
let (first_part_local, _) = uploaded_bytes.split_at(3);
let mut partial_download_with_metadata = storage
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
.await?;
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
io::copy(
&mut partial_download_with_metadata.download_stream,
&mut first_part_remote,
)
.await?;
let partial_download_metadata = storage
.download_byte_range(
&upload_target,
0,
Some(first_part_local.len() as u64),
&mut first_part_remote,
)
.await?;
first_part_remote.flush().await?;
let first_part_remote = first_part_remote.into_inner().into_inner();
assert_eq!(
@@ -816,8 +800,8 @@ mod fs_tests {
);
assert_eq!(
partial_download_with_metadata.metadata,
Some(metadata),
partial_download_metadata.as_ref(),
Some(&metadata),
"We should get the same metadata back for partial download"
);
@@ -859,7 +843,7 @@ mod fs_tests {
}
fn dummy_contents(name: &str) -> String {
format!("contents for {name}")
format!("contents for {}", name)
}
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {

View File

@@ -9,19 +9,17 @@ use std::path::{Path, PathBuf};
use anyhow::Context;
use rusoto_core::{
credential::{InstanceMetadataProvider, StaticProvider},
HttpClient, Region, RusotoError,
HttpClient, Region,
};
use rusoto_s3::{
DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
S3Client, StreamingBody, S3,
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
StreamingBody, S3,
};
use tokio::{io, sync::Semaphore};
use tokio_util::io::ReaderStream;
use tracing::debug;
use crate::{
strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
};
use crate::{strip_path_prefix, RemoteStorage, S3Config};
use super::StorageMetadata;
@@ -119,25 +117,6 @@ impl S3ObjectKey {
}
}
impl RemoteObjectName for S3ObjectKey {
/// Turn a/b/c or a/b/c/ into c
fn object_name(&self) -> Option<&str> {
// corner case, char::to_string is not const, that's why this is more verbose than it needs to be
// see https://github.com/rust-lang/rust/issues/88674
if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
return None;
}
if self.0.ends_with(S3_PREFIX_SEPARATOR) {
self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
} else {
self.0
.rsplit_once(S3_PREFIX_SEPARATOR)
.map(|(_, last)| last)
}
}
}
/// AWS S3 storage.
pub struct S3Bucket {
workdir: PathBuf,
@@ -208,39 +187,6 @@ impl S3Bucket {
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
})
}
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 download")
.map_err(DownloadError::Other)?;
metrics::inc_get_object();
match self.client.get_object(request).await {
Ok(object_output) => match object_output.body {
None => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(
"Got no body for the S3 object given"
)))
}
Some(body) => Ok(Download {
metadata: object_output.metadata.map(StorageMetadata),
download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
}),
},
Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
Err(e) => {
metrics::inc_get_object_fail();
Err(DownloadError::Other(anyhow::anyhow!(
"Failed to download S3 object: {e}"
)))
}
}
}
}
#[async_trait::async_trait]
@@ -304,77 +250,6 @@ impl RemoteStorage for S3Bucket {
Ok(document_keys)
}
/// Note: it won't include empty "directories"
async fn list_prefixes(
&self,
prefix: Option<Self::RemoteObjectId>,
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
let list_prefix = match prefix {
Some(prefix) => {
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
// if there is no trailing / in the default prefix and the
// supplied prefix does not start with "/", insert it
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
{
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
prefix_in_bucket.push_str(&prefix.0);
// the prefix is required to end with a separator,
// otherwise the request will return only the prefix entry itself
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
}
Some(prefix_in_bucket)
}
None => self.prefix_in_bucket.clone(),
};
let mut document_keys = Vec::new();
let mut continuation_token = None;
loop {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 list")?;
metrics::inc_list_objects();
let fetch_response = self
.client
.list_objects_v2(ListObjectsV2Request {
bucket: self.bucket_name.clone(),
prefix: list_prefix.clone(),
continuation_token,
delimiter: Some(S3_PREFIX_SEPARATOR.to_string()),
..ListObjectsV2Request::default()
})
.await
.map_err(|e| {
metrics::inc_list_objects_fail();
e
})?;
document_keys.extend(
fetch_response
.common_prefixes
.unwrap_or_default()
.into_iter()
.filter_map(|o| Some(S3ObjectKey(o.prefix?))),
);
match fetch_response.continuation_token {
Some(new_token) => continuation_token = Some(new_token),
None => break,
}
}
Ok(document_keys)
}
async fn upload(
&self,
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -408,13 +283,38 @@ impl RemoteStorage for S3Bucket {
Ok(())
}
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
..GetObjectRequest::default()
})
.await
async fn download(
&self,
from: &Self::RemoteObjectId,
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 download")?;
metrics::inc_get_object();
let object_output = self
.client
.get_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
..GetObjectRequest::default()
})
.await
.map_err(|e| {
metrics::inc_get_object_fail();
e
})?;
if let Some(body) = object_output.body {
let mut from = io::BufReader::new(body.into_async_read());
io::copy(&mut from, to).await?;
}
Ok(object_output.metadata.map(StorageMetadata))
}
async fn download_byte_range(
@@ -422,7 +322,8 @@ impl RemoteStorage for S3Bucket {
from: &Self::RemoteObjectId,
start_inclusive: u64,
end_exclusive: Option<u64>,
) -> Result<Download, DownloadError> {
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
) -> anyhow::Result<Option<StorageMetadata>> {
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
// and needs both ends to be inclusive
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
@@ -430,14 +331,34 @@ impl RemoteStorage for S3Bucket {
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
None => format!("bytes={}-", start_inclusive),
});
let _guard = self
.concurrency_limiter
.acquire()
.await
.context("Concurrency limiter semaphore got closed during S3 range download")?;
self.download_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
range,
..GetObjectRequest::default()
})
.await
metrics::inc_get_object();
let object_output = self
.client
.get_object(GetObjectRequest {
bucket: self.bucket_name.clone(),
key: from.key().to_owned(),
range,
..GetObjectRequest::default()
})
.await
.map_err(|e| {
metrics::inc_get_object_fail();
e
})?;
if let Some(body) = object_output.body {
let mut from = io::BufReader::new(body.into_async_read());
io::copy(&mut from, to).await?;
}
Ok(object_output.metadata.map(StorageMetadata))
}
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
@@ -470,25 +391,6 @@ mod tests {
use super::*;
#[test]
fn object_name() {
let k = S3ObjectKey("a/b/c".to_owned());
assert_eq!(k.object_name(), Some("c"));
let k = S3ObjectKey("a/b/c/".to_owned());
assert_eq!(k.object_name(), Some("c"));
let k = S3ObjectKey("a/".to_owned());
assert_eq!(k.object_name(), Some("a"));
// XXX is it impossible to have an empty key?
let k = S3ObjectKey("".to_owned());
assert_eq!(k.object_name(), None);
let k = S3ObjectKey("/".to_owned());
assert_eq!(k.object_name(), None);
}
#[test]
fn download_destination() -> anyhow::Result<()> {
let workdir = tempdir()?.path().to_owned();

View File

@@ -537,13 +537,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
match tenant_match.subcommand() {
Some(("list", _)) => {
for t in pageserver.tenant_list()? {
println!(
"{} {}",
t.id,
t.state
.map(|s| s.to_string())
.unwrap_or_else(|| String::from(""))
);
println!("{} {}", t.id, t.state);
}
}
Some(("create", create_match)) => {

View File

@@ -1,4 +1,15 @@
# Services
## Page server architecture
The Page Server has a few different duties:
- Respond to GetPage@LSN requests from the Compute Nodes
- Receive WAL from WAL safekeeper
- Replay WAL that's applicable to the chunks that the Page Server maintains
- Backup to S3
S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not synced to S3 yet.
The Page Server consists of multiple threads that operate on a shared
repository of page versions:
@@ -10,22 +21,18 @@ repository of page versions:
| WAL receiver |
| |
+--------------+
......
+---------+ +--------+ . .
| | | | . .
GetPage@LSN | | | backup | -------> . S3 .
-------------> | Page | repository | | . .
| Service | +--------+ . .
page | | ......
+----+
+---------+ .......... | |
| | . . | |
GetPage@LSN | | . backup . -------> | S3 |
-------------> | Page | repository . . | |
| Service | .......... | |
page | | +----+
<------------- | |
+---------+ +-----------+ +--------------------+
| WAL redo | | Checkpointing, |
+----------+ | processes | | Garbage collection |
| | +-----------+ +--------------------+
| HTTP |
| mgmt API |
| |
+----------+
+---------+ +--------------------+
| Checkpointing / |
| Garbage collection |
+--------------------+
Legend:
@@ -33,77 +40,28 @@ Legend:
| | A thread or multi-threaded service
+--+
....
. . Component at its early development phase.
....
---> Data flow
<---
```
## Page Service
Page Service
------------
The Page Service listens for GetPage@LSN requests from the Compute Nodes,
and responds with pages from the repository. On each GetPage@LSN request,
it calls into the Repository function
A separate thread is spawned for each incoming connection to the page
service. The page service uses the libpq protocol to communicate with
the client. The client is a Compute Postgres instance.
## WAL Receiver
The WAL receiver connects to the external WAL safekeeping service
using PostgreSQL physical streaming replication, and continuously
receives WAL. It decodes the WAL records, and stores them to the
repository.
and responds with pages from the repository.
## Backup service
WAL Receiver
------------
The backup service is responsible for storing pageserver recovery data externally.
Currently, pageserver stores its files in a filesystem directory it's pointed to.
That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
The code for storage support is extensible and can support arbitrary storage backends as long as they implement a certain Rust trait.
There are the following implementations present:
* local filesystem - to use mainly in tests
* AWS S3 - to use in production
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and the corresponding Rust file docs; parameters documentation can be found in the [settings docs](../docs/settings.md).
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS settings page for the user. Also note that bucket names do not contain any protocol prefix when used on AWS.
For local S3 installations, refer to their documentation for the name format and credentials.
Similar to other pageserver settings, a toml config file can be used to configure either of the storages as a backup target.
Required sections are:
```toml
[remote_storage]
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
```
or
```toml
[remote_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
prefix_in_bucket = '/test_prefix/'
```
`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
## Repository background tasks
The Repository also has a few different background threads and tokio tasks that perform
background duties like dumping accumulated WAL data from memory to disk, reorganizing
files for performance (compaction), and garbage collecting old files.
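To make the "background threads and tokio tasks" above concrete, here is a hedged sketch (not the pageserver's actual task manager) of a periodic tokio task with graceful shutdown; the function name `compaction_loop` and the watch-channel shutdown signal are assumptions made for the example.
```rust
use std::time::Duration;
use tokio::sync::watch;

// A periodic background loop: runs one iteration every 10 seconds and
// exits cleanly when the shutdown flag flips.
async fn compaction_loop(mut shutdown: watch::Receiver<bool>) {
    loop {
        tokio::select! {
            _ = tokio::time::sleep(Duration::from_secs(10)) => {
                // one compaction / GC iteration would run here; errors are logged, not propagated
            }
            _ = shutdown.changed() => break,
        }
    }
}

#[tokio::main]
async fn main() {
    let (shutdown_tx, shutdown_rx) = watch::channel(false);
    let task = tokio::spawn(compaction_loop(shutdown_rx));
    // ... later, on shutdown:
    let _ = shutdown_tx.send(true);
    task.await.unwrap(); // wait for the loop to actually finish
}
```
Awaiting the `JoinHandle` at the end ensures the loop has actually finished before the process exits.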
The WAL receiver connects to the external WAL safekeeping service (or
directly to the primary) using PostgreSQL physical streaming
replication, and continuously receives WAL. It decodes the WAL records,
and stores them to the repository.
Repository
@@ -158,6 +116,48 @@ Remove old on-disk layer files that are no longer needed according to the
PITR retention policy
### Backup service
The backup service is responsible for storing pageserver recovery data externally.
Currently, pageserver stores its files in a filesystem directory it's pointed to.
That working directory could be rather ephemeral for such cases as "a pageserver pod running in k8s with no persistent volumes attached".
Therefore, the server interacts with external, more reliable storage to back up and restore its state.
The code for storage support is extensible and can support arbitrary storage backends as long as they implement a certain Rust trait.
There are the following implementations present:
* local filesystem - to use mainly in tests
* AWS S3 - to use in production
Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and the corresponding Rust file docs; parameters documentation can be found in the [settings docs](../docs/settings.md).
The backup service is disabled by default and can be enabled to interact with a single remote storage.
CLI examples:
* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
For Amazon AWS S3, the key id and secret access key can be found in `~/.aws/credentials` (if awscli was ever configured to work with the desired bucket) or on the AWS settings page for the user. Also note that bucket names do not contain any protocol prefix when used on AWS.
For local S3 installations, refer to their documentation for the name format and credentials.
Similar to other pageserver settings, a toml config file can be used to configure either of the storages as a backup target.
Required sections are:
```toml
[remote_storage]
local_path = '/Users/someonetoignore/Downloads/tmp_dir/'
```
or
```toml
[remote_storage]
bucket_name = 'some-sample-bucket'
bucket_region = 'eu-north-1'
prefix_in_bucket = '/test_prefix/'
```
`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
TODO: Sharding
--------------------

View File

@@ -60,7 +60,6 @@ where
write: W,
timeline: &'a Arc<DatadirTimelineImpl>,
req_lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
full_backup: bool,
) -> Result<Basebackup<'a, W>> {
// Compute postgres doesn't have any previous WAL files, but the first
@@ -97,26 +96,16 @@ where
(end_of_timeline.prev, end_of_timeline.last)
};
// Consolidate the derived and the provided prev_lsn values
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
if backup_prev != Lsn(0) {
ensure!(backup_prev == provided_prev_lsn)
}
provided_prev_lsn
} else {
backup_prev
};
info!(
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
backup_lsn, prev_lsn, full_backup
backup_lsn, backup_prev, full_backup
);
Ok(Basebackup {
ar: Builder::new(AbortableWrite::new(write)),
timeline,
lsn: backup_lsn,
prev_record_lsn: prev_lsn,
prev_record_lsn: backup_prev,
full_backup,
finished: false,
})

View File

@@ -22,49 +22,6 @@ paths:
properties:
id:
type: integer
/v1/tenant/{tenant_id}:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
get:
description: Get tenant status
responses:
"200":
description: Currently returns a flag indicating whether the tenant has in-progress timeline downloads
content:
application/json:
schema:
$ref: "#/components/schemas/TenantInfo"
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline:
parameters:
- name: tenant_id
@@ -113,7 +70,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
parameters:
- name: tenant_id
@@ -128,14 +84,13 @@ paths:
schema:
type: string
format: hex
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
get:
description: Get info about the timeline
parameters:
- name: include-non-incremental-logical-size
in: query
schema:
type: string
description: Controls calculation of current_logical_size_non_incremental
responses:
"200":
description: TimelineInfo
@@ -167,35 +122,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/Error"
delete:
description: "Attempts to delete specified timeline. On 500 errors should be retried"
responses:
"200":
description: Ok
"400":
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
parameters:
@@ -245,7 +171,7 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/attach:
/v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
parameters:
- name: tenant_id
in: path
@@ -253,13 +179,19 @@ paths:
schema:
type: string
format: hex
- name: timeline_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Schedules an attach operation to happen in the background for the given tenant
description: Attach remote timeline
responses:
"202":
description: Tenant attaching scheduled
"200":
description: Timeline attaching scheduled
"400":
description: Error when no tenant id found in path parameters
description: Error when no tenant id found in path or no timeline id
content:
application/json:
schema:
@@ -283,7 +215,7 @@ paths:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: Tenant download is already in progress
description: Timeline download is already in progress
content:
application/json:
schema:
@@ -295,6 +227,7 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach:
parameters:
- name: tenant_id
@@ -310,11 +243,10 @@ paths:
type: string
format: hex
post:
description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead
deprecated: true
description: Detach local timeline
responses:
"200":
description: Ok
description: Timeline detached
"400":
description: Error when no tenant id found in path or no timeline id
content:
@@ -340,43 +272,6 @@ paths:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/detach:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
format: hex
post:
description: Detach local tenant
responses:
"200":
description: Tenant detached
"400":
description: Error when no tenant id found in path parameters
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"401":
description: Unauthorized Error
content:
application/json:
schema:
$ref: "#/components/schemas/UnauthorizedError"
"403":
description: Forbidden Error
content:
application/json:
schema:
$ref: "#/components/schemas/ForbiddenError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
/v1/tenant/{tenant_id}/timeline/:
parameters:
@@ -572,13 +467,12 @@ components:
type: object
required:
- id
- state
properties:
id:
type: string
state:
type: string
has_in_progress_downloads:
type: boolean
TenantCreateInfo:
type: object
properties:
@@ -673,7 +567,6 @@ components:
type: integer
current_logical_size_non_incremental:
type: integer
WalReceiverEntry:
type: object
required:

View File

@@ -14,7 +14,6 @@ use crate::repository::Repository;
use crate::storage_sync;
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
use crate::tenant_config::TenantConfOpt;
use crate::tenant_mgr::TenantInfo;
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
use crate::{config::PageServerConf, tenant_mgr, timelines};
use utils::{
@@ -210,9 +209,9 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
.await;
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
return Err(ApiError::NotFound(format!(
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
)));
return Err(ApiError::NotFound(
"Timeline is not found neither locally nor remotely".to_string(),
));
}
let timeline_info = TimelineInfo {
@@ -242,157 +241,123 @@ async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Bod
json_response(StatusCode::OK, &wal_receiver_entry)
}
// TODO: it makes sense to provide the tenant config right away, the same way it is handled in tenant_create
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
info!("Handling tenant attach {}", tenant_id,);
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
info!(
"Handling timeline {} attach for tenant: {}",
timeline_id, tenant_id,
);
tokio::task::spawn_blocking(move || {
if tenant_mgr::get_tenant_state(tenant_id).is_some() {
anyhow::bail!("Tenant is already present locally")
if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
// TODO: maybe answer with 304 Not Modified here?
anyhow::bail!("Timeline is already present locally")
};
Ok(())
})
.await
.map_err(ApiError::from_err)??;
let sync_id = ZTenantTimelineId {
tenant_id,
timeline_id,
};
let state = get_state(&request);
let remote_index = &state.remote_index;
let mut index_accessor = remote_index.write().await;
if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) {
if tenant_entry.has_in_progress_downloads() {
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
if remote_timeline.awaits_download {
return Err(ApiError::Conflict(
"Tenant download is already in progress".to_string(),
"Timeline download is already in progress".to_string(),
));
}
for (timeline_id, remote_timeline) in tenant_entry.iter_mut() {
storage_sync::schedule_layer_download(tenant_id, *timeline_id);
remote_timeline.awaits_download = true;
}
return json_response(StatusCode::ACCEPTED, ());
}
// no tenant in the index, release the lock to perform the potentially lengthy download operation
drop(index_accessor);
// download index parts for every tenant timeline
let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await {
Ok(Some(remote_timelines)) => remote_timelines,
Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())),
Err(e) => {
error!("Failed to retrieve remote tenant data: {:?}", e);
return Err(ApiError::NotFound(
"Failed to retrieve remote tenant".to_string(),
));
}
};
// recheck that download is not in progress because
// we've released the lock to avoid holding it during the download
let mut index_accessor = remote_index.write().await;
let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) {
Some(tenant_entry) => {
if tenant_entry.has_in_progress_downloads() {
return Err(ApiError::Conflict(
"Tenant download is already in progress".to_string(),
));
}
tenant_entry
}
None => index_accessor.add_tenant_entry(tenant_id),
};
// populate remote index with the data from index part and create directories on the local filesystem
for (timeline_id, mut remote_timeline) in remote_timelines {
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
.await
.context("Failed to create new timeline directory")?;
remote_timeline.awaits_download = true;
tenant_entry.insert(timeline_id, remote_timeline);
// schedule actual download
storage_sync::schedule_layer_download(tenant_id, timeline_id);
return json_response(StatusCode::ACCEPTED, ());
} else {
// no timeline in the index, release the lock to perform the potentially lengthy download operation
drop(index_accessor);
}
let new_timeline = match try_download_index_part_data(state, sync_id).await {
Ok(Some(mut new_timeline)) => {
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
.await
.context("Failed to create new timeline directory")?;
new_timeline.awaits_download = true;
new_timeline
}
Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
Err(e) => {
error!("Failed to retrieve remote timeline data: {:?}", e);
return Err(ApiError::NotFound(
"Failed to retrieve remote timeline".to_string(),
));
}
};
let mut index_accessor = remote_index.write().await;
match index_accessor.timeline_entry_mut(&sync_id) {
Some(remote_timeline) => {
if remote_timeline.awaits_download {
return Err(ApiError::Conflict(
"Timeline download is already in progress".to_string(),
));
}
remote_timeline.awaits_download = true;
}
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
}
storage_sync::schedule_layer_download(tenant_id, timeline_id);
json_response(StatusCode::ACCEPTED, ())
}
/// Note: this is expensive from an S3 access perspective,
/// for details see the comment on `storage_sync::gather_tenant_timelines_index_parts`
async fn gather_tenant_timelines_index_parts(
async fn try_download_index_part_data(
state: &State,
tenant_id: ZTenantId,
) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
let index_parts = match state.remote_storage.as_ref() {
sync_id: ZTenantTimelineId,
) -> anyhow::Result<Option<RemoteTimeline>> {
let index_part = match state.remote_storage.as_ref() {
Some(GenericRemoteStorage::Local(local_storage)) => {
storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id)
.await
storage_sync::download_index_part(state.conf, local_storage, sync_id).await
}
// FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones
// because it is a different instance. We can move this limit to some global static
// or use one instance everywhere.
Some(GenericRemoteStorage::S3(s3_storage)) => {
storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id)
.await
storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
}
None => return Ok(None),
}
.with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?;
.with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;
let mut remote_timelines = Vec::with_capacity(index_parts.len());
for (timeline_id, index_part) in index_parts {
let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id);
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
.with_context(|| {
format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}")
})?;
remote_timelines.push((timeline_id, remote_timeline));
}
Ok(Some(remote_timelines))
let timeline_path = state
.conf
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
RemoteTimeline::from_index_part(&timeline_path, index_part)
.map(Some)
.with_context(|| {
format!("Failed to convert index part into remote timeline for timeline {sync_id}")
})
}
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
let state = get_state(&request);
tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
tenant_mgr::delete_timeline(tenant_id, timeline_id)
let _enter =
info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
.entered();
let state = get_state(&request);
tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
})
.await
.map_err(ApiError::from_err)??;
let mut remote_index = state.remote_index.write().await;
remote_index.remove_timeline_entry(ZTenantTimelineId {
tenant_id,
timeline_id,
});
json_response(StatusCode::OK, ())
}
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let conf = state.conf;
tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
tenant_mgr::detach_tenant(conf, tenant_id)
})
.await
.map_err(ApiError::from_err)??;
let mut remote_index = state.remote_index.write().await;
remote_index.remove_tenant_entry(&tenant_id);
json_response(StatusCode::OK, ())
}
@@ -400,13 +365,9 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
// check for management permission
check_permission(&request, None)?;
let state = get_state(&request);
// clone to avoid holding the lock while awaiting for blocking task
let remote_index = state.remote_index.read().await.clone();
let response_data = tokio::task::spawn_blocking(move || {
let _enter = info_span!("tenant_list").entered();
crate::tenant_mgr::list_tenants(&remote_index)
crate::tenant_mgr::list_tenants()
})
.await
.map_err(ApiError::from_err)?;
@@ -414,34 +375,6 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
json_response(StatusCode::OK, response_data)
}
async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
// if the tenant is in the process of downloading, it can be absent from the global tenant map
let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id))
.await
.map_err(ApiError::from_err)?;
let state = get_state(&request);
let remote_index = &state.remote_index;
let index_accessor = remote_index.read().await;
let has_in_progress_downloads = index_accessor
.tenant_entry(&tenant_id)
.ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))?
.has_in_progress_downloads();
json_response(
StatusCode::OK,
TenantInfo {
id: tenant_id,
state: tenant_state,
has_in_progress_downloads: Some(has_in_progress_downloads),
},
)
}
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
// check for management permission
check_permission(&request, None)?;
@@ -587,28 +520,24 @@ pub fn make_router(
.get("/v1/status", status_handler)
.get("/v1/tenant", tenant_list_handler)
.post("/v1/tenant", tenant_create_handler)
.get("/v1/tenant/:tenant_id", tenant_status)
.put("/v1/tenant/config", tenant_config_handler)
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_detail_handler,
)
.delete(
"/v1/tenant/:tenant_id/timeline/:timeline_id",
timeline_delete_handler,
)
// for backward compatibility
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
timeline_delete_handler,
)
.get(
"/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
wal_receiver_get_handler,
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
timeline_attach_handler,
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
timeline_detach_handler,
)
.any(handler_404))
}

View File

@@ -57,7 +57,6 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
pg_control = Some(control_file);
}
modification.flush()?;
}
}
@@ -318,7 +317,6 @@ pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
// We found the pg_control file.
pg_control = Some(res);
}
modification.flush()?;
}
tar::EntryType::Directory => {
debug!("directory {:?}", file_path);
@@ -518,23 +516,10 @@ pub fn import_file<R: Repository, Reader: Read>(
// Parse zenith signal file to set correct previous LSN
let bytes = read_all_bytes(reader)?;
// zenith.signal format is "PREV LSN: prev_lsn"
// TODO write serialization and deserialization in the same place.
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
let prev_lsn = match zenith_signal {
"PREV LSN: none" => Lsn(0),
"PREV LSN: invalid" => Lsn(0),
other => {
let split = other.split(':').collect::<Vec<_>>();
split[1]
.trim()
.parse::<Lsn>()
.context("can't parse zenith.signal")?
}
};
let zenith_signal = std::str::from_utf8(&bytes)?;
let zenith_signal = zenith_signal.split(':').collect::<Vec<_>>();
let prev_lsn = zenith_signal[1].trim().parse::<Lsn>()?;
// zenith.signal is not necessarily the last file that we handle,
// but it is ok to call `finish_write()`, because the final `modification.commit()`
// will update the lsn once more to the final one.
let writer = modification.tline.tline.writer();
writer.finish_write(prev_lsn);
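
As a side note, a minimal standalone sketch (not the pageserver code) of how a "PREV LSN: ..." line in zenith.signal can map to a previous LSN, covering both the "none"/"invalid" markers and a concrete hex value; the Lsn struct and the hi/lo parsing here are stand-ins for the real utils::lsn::Lsn:

// Hypothetical stand-in for utils::lsn::Lsn, used only for this sketch.
#[derive(Debug, PartialEq)]
struct Lsn(u64);

fn parse_zenith_signal(contents: &str) -> Result<Lsn, String> {
    // The file holds a single line such as "PREV LSN: none",
    // "PREV LSN: invalid" or "PREV LSN: 0/16960E8".
    match contents.trim() {
        "PREV LSN: none" | "PREV LSN: invalid" => Ok(Lsn(0)),
        other => {
            let value = other
                .split(':')
                .nth(1)
                .ok_or_else(|| format!("malformed zenith.signal: {other}"))?
                .trim();
            // A real Lsn parses the "hi/lo" hex form; this sketch handles only that shape.
            let (hi, lo) = value
                .split_once('/')
                .ok_or_else(|| format!("can't parse LSN: {value}"))?;
            let hi = u64::from_str_radix(hi, 16).map_err(|e| e.to_string())?;
            let lo = u64::from_str_radix(lo, 16).map_err(|e| e.to_string())?;
            Ok(Lsn((hi << 32) | lo))
        }
    }
}

fn main() {
    assert_eq!(parse_zenith_signal("PREV LSN: none").unwrap(), Lsn(0));
    assert_eq!(parse_zenith_signal("PREV LSN: 0/16960E8").unwrap(), Lsn(0x16960E8));
}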

File diff suppressed because it is too large

View File

@@ -1,4 +1,4 @@
# Pageserver storage
# Overview
The main responsibility of the Page Server is to process the incoming WAL, and
reprocess it into a format that allows reasonably quick access to any page

View File

@@ -34,7 +34,7 @@ pub trait BlobCursor {
) -> Result<(), std::io::Error>;
}
impl<R> BlobCursor for BlockCursor<R>
impl<'a, R> BlobCursor for BlockCursor<R>
where
R: BlockReader,
{

View File

@@ -445,10 +445,7 @@ impl ImageLayerWriter {
},
);
info!("new image layer {}", path.display());
let mut file = VirtualFile::open_with_options(
&path,
std::fs::OpenOptions::new().write(true).create_new(true),
)?;
let mut file = VirtualFile::create(&path)?;
// make room for the header block
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);

View File

@@ -267,13 +267,13 @@ impl InMemoryLayer {
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
/// Adds the page version to the in-memory tree
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
let mut inner = self.inner.write().unwrap();
inner.assert_writeable();
let off = inner.file.write_blob(&Value::ser(val)?)?;
let off = inner.file.write_blob(&Value::ser(&val)?)?;
let vec_map = inner.index.entry(key).or_default();
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;

View File

@@ -554,7 +554,7 @@ impl PageServerHandler {
// Create empty timeline
info!("creating new timeline");
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
let timeline = repo.create_empty_timeline(timeline_id, Lsn(0))?;
let repartition_distance = repo.get_checkpoint_distance();
let mut datadir_timeline =
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
@@ -772,7 +772,6 @@ impl PageServerHandler {
pgb: &mut PostgresBackend,
timelineid: ZTimelineId,
lsn: Option<Lsn>,
prev_lsn: Option<Lsn>,
tenantid: ZTenantId,
full_backup: bool,
) -> anyhow::Result<()> {
@@ -797,8 +796,7 @@ impl PageServerHandler {
{
let mut writer = CopyDataSink { pgb };
let basebackup =
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, full_backup)?;
span.record("lsn", &basebackup.lsn.to_string().as_str());
basebackup.send_tarball()?;
}
@@ -901,67 +899,33 @@ impl postgres_backend::Handler for PageServerHandler {
};
// Check that the timeline exists
self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?;
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, false)?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
// return pair of prev_lsn and last_lsn
else if query_string.starts_with("get_last_record_rlsn ") {
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() == 2,
"invalid param number for get_last_record_rlsn command"
);
let tenantid = ZTenantId::from_str(params[0])?;
let timelineid = ZTimelineId::from_str(params[1])?;
self.check_permission(Some(tenantid))?;
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
.context("Cannot load local timeline")?;
let end_of_timeline = timeline.tline.get_last_record_rlsn();
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::text_col(b"prev_lsn"),
RowDescriptor::text_col(b"last_lsn"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(end_of_timeline.prev.to_string().as_bytes()),
Some(end_of_timeline.last.to_string().as_bytes()),
]))?
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
}
// same as basebackup, but result includes relational data as well
else if query_string.starts_with("fullbackup ") {
let (_, params_raw) = query_string.split_at("fullbackup ".len());
let params = params_raw.split_whitespace().collect::<Vec<_>>();
ensure!(
params.len() >= 2,
params.len() == 3,
"invalid param number for fullbackup command"
);
let tenantid = ZTenantId::from_str(params[0])?;
let timelineid = ZTimelineId::from_str(params[1])?;
// The caller is responsible for providing correct lsn and prev_lsn.
let lsn = if params.len() > 2 {
Some(Lsn::from_str(params[2])?)
} else {
None
};
let prev_lsn = if params.len() > 3 {
Some(Lsn::from_str(params[3])?)
} else {
None
};
self.check_permission(Some(tenantid))?;
// Lsn is required for fullbackup, because otherwise we would not know
// at which lsn to upload this backup.
//
// The caller is responsible for providing a valid lsn
// and using it in the subsequent import.
let lsn = Some(Lsn::from_str(params[2])?);
// Check that the timeline exists
self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?;
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, true)?;
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else if query_string.starts_with("import basebackup ") {
// Import the `base` section (everything but the wal) of a basebackup.
@@ -987,10 +951,7 @@ impl postgres_backend::Handler for PageServerHandler {
match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) {
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
}
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
};
} else if query_string.starts_with("import wal ") {
// Import the `pg_wal` section of a basebackup.
@@ -1009,10 +970,7 @@ impl postgres_backend::Handler for PageServerHandler {
match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) {
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
Err(e) => {
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
}
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
};
} else if query_string.to_ascii_lowercase().starts_with("set ") {
// important because psycopg2 executes "SET datestyle TO 'ISO'"
@@ -1193,7 +1151,6 @@ impl postgres_backend::Handler for PageServerHandler {
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
LsnForTimestamp::Future(_lsn) => "future".into(),
LsnForTimestamp::Past(_lsn) => "past".into(),
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
};
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;

View File

@@ -51,7 +51,6 @@ pub enum LsnForTimestamp {
Present(Lsn),
Future(Lsn),
Past(Lsn),
NoData(Lsn),
}
impl<R: Repository> DatadirTimeline<R> {
@@ -264,7 +263,7 @@ impl<R: Repository> DatadirTimeline<R> {
(false, false) => {
// This can happen if no commit records have been processed yet, e.g.
// just after importing a cluster.
Ok(LsnForTimestamp::NoData(max_lsn))
bail!("no commit timestamps found");
}
(true, false) => {
// Didn't find any commit timestamps larger than the request
@@ -902,57 +901,6 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
Ok(())
}
///
/// Flush changes accumulated so far to the underlying repository.
///
/// Usually, changes made in DatadirModification are atomic, but this allows
/// you to flush them to the underlying repository before the final `commit`.
/// That allows freeing up the memory used to hold the pending changes.
///
/// Currently only used during bulk import of a data directory. In that
/// context, breaking the atomicity is OK. If the import is interrupted, the
/// whole import fails and the timeline will be deleted anyway.
/// (Or to be precise, it will be left behind for debugging purposes and
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
///
/// Note: A consequence of flushing the pending operations is that they
/// won't be visible to subsequent operations until `commit`. The function
/// retains all the metadata, but data pages are flushed. That's again OK
/// for bulk import, where you are just loading data pages and won't try to
/// modify the same pages twice.
pub fn flush(&mut self) -> Result<()> {
// Unless we have accumulated a decent amount of changes, it's not worth it
// to scan through the pending_updates list.
let pending_nblocks = self.pending_nblocks;
if pending_nblocks < 10000 {
return Ok(());
}
let writer = self.tline.tline.writer();
// Flush relation and SLRU data blocks, keep metadata.
let mut result: Result<()> = Ok(());
self.pending_updates.retain(|&key, value| {
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
result = writer.put(key, self.lsn, value);
false
} else {
true
}
});
result?;
if pending_nblocks != 0 {
self.tline.current_logical_size.fetch_add(
pending_nblocks * pg_constants::BLCKSZ as isize,
Ordering::SeqCst,
);
self.pending_nblocks = 0;
}
Ok(())
}
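
As a rough usage sketch (not taken from the import code), this is the pattern the doc comment above describes: a bulk loader applies many page writes through one modification, flushes periodically to bound memory, and commits once at the end. The BulkModification trait and its method names are hypothetical and only mirror the put/flush/commit shape:

use anyhow::Result;

/// Hypothetical trait mirroring the put/flush/commit shape of DatadirModification.
trait BulkModification {
    fn put_page(&mut self, key: u64, data: &[u8]) -> Result<()>;
    fn flush(&mut self) -> Result<()>;
    fn commit(self) -> Result<()>;
}

/// Import pages in bulk, flushing data pages every `flush_every` puts so the
/// pending-updates map does not grow without bound; metadata stays pending
/// until the single commit at the end.
fn bulk_import<M: BulkModification>(
    mut modification: M,
    pages: impl Iterator<Item = (u64, Vec<u8>)>,
    flush_every: usize,
) -> Result<()> {
    for (i, (key, data)) in pages.enumerate() {
        modification.put_page(key, &data)?;
        if (i + 1) % flush_every == 0 {
            modification.flush()?;
        }
    }
    modification.commit()
}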
///
/// Finish this atomic update, writing all the updated keys to the
/// underlying timeline.
@@ -963,7 +911,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
let pending_nblocks = self.pending_nblocks;
for (key, value) in self.pending_updates {
writer.put(key, self.lsn, &value)?;
writer.put(key, self.lsn, value)?;
}
for key_range in self.pending_deletions {
writer.delete(key_range.clone(), self.lsn)?;
@@ -1368,10 +1316,6 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
})
}
fn is_rel_block_key(key: Key) -> bool {
key.field1 == 0x00 && key.field4 != 0
}
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
Ok(match key.field1 {
0x01 => {
@@ -1390,12 +1334,6 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
})
}
fn is_slru_block_key(key: Key) -> bool {
key.field1 == 0x01 // SLRU-related
&& key.field3 == 0x00000001 // but not SlruDir
&& key.field6 != 0xffffffff // and not SlruSegSize
}
//
//-- Tests that should work the same with any Repository/Timeline implementation.
//

View File

@@ -81,12 +81,6 @@ mod profiling_impl {
pub struct DummyProfilerGuard;
impl Drop for DummyProfilerGuard {
fn drop(&mut self) {
// do nothing, this exists to calm Clippy down
}
}
pub fn profpoint_start(
_conf: &PageServerConf,
_point: ProfilingConfig,

View File

@@ -7,6 +7,7 @@ use byteorder::{ByteOrder, BE};
use bytes::Bytes;
use serde::{Deserialize, Serialize};
use std::fmt;
use std::fmt::Display;
use std::ops::{AddAssign, Range};
use std::sync::{Arc, RwLockReadGuard};
use std::time::Duration;
@@ -181,6 +182,20 @@ impl Value {
}
}
#[derive(Clone, Copy, Debug)]
pub enum TimelineSyncStatusUpdate {
Downloaded,
}
impl Display for TimelineSyncStatusUpdate {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let s = match self {
TimelineSyncStatusUpdate::Downloaded => "Downloaded",
};
f.write_str(s)
}
}
///
/// A repository corresponds to one .neon directory. One repository holds multiple
/// timelines, forked off from the same initial call to 'initdb'.
@@ -189,7 +204,11 @@ pub trait Repository: Send + Sync {
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
/// See [`crate::remote_storage`] for more details about the synchronization.
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
fn apply_timeline_remote_sync_status_update(
&self,
timeline_id: ZTimelineId,
timeline_sync_status_update: TimelineSyncStatusUpdate,
) -> Result<()>;
/// Get Timeline handle for given zenith timeline ID.
/// This function is idempotent. It doesn't change internal state in any way.
@@ -206,17 +225,12 @@ pub trait Repository: Send + Sync {
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
fn create_empty_timeline(
&self,
timeline_id: ZTimelineId,
timelineid: ZTimelineId,
initdb_lsn: Lsn,
) -> Result<Arc<Self::Timeline>>;
/// Branch a timeline
fn branch_timeline(
&self,
src: ZTimelineId,
dst: ZTimelineId,
start_lsn: Option<Lsn>,
) -> Result<()>;
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
/// Flush all data to disk.
///
@@ -246,10 +260,10 @@ pub trait Repository: Send + Sync {
/// api's 'compact' command.
fn compaction_iteration(&self) -> Result<()>;
/// removes timeline-related in-memory data
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;
/// detaches timeline-related in-memory data.
fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
/// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
fn get_remote_index(&self) -> &RemoteIndex;
}
@@ -393,7 +407,7 @@ pub trait TimelineWriter<'a> {
///
/// This will implicitly extend the relation, if the page is beyond the
/// current end-of-file.
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>;
fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>;
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
@@ -523,7 +537,7 @@ pub mod repo_harness {
TenantConfOpt::from(self.tenant_conf),
walredo_mgr,
self.tenant_id,
RemoteIndex::default(),
RemoteIndex::empty(),
false,
);
// populate repo with locally available timelines
@@ -539,7 +553,10 @@ pub mod repo_harness {
.parse()
.unwrap();
repo.attach_timeline(timeline_id)?;
repo.apply_timeline_remote_sync_status_update(
timeline_id,
TimelineSyncStatusUpdate::Downloaded,
)?;
}
Ok(repo)
@@ -603,12 +620,12 @@ mod tests {
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
writer.finish_write(Lsn(0x10));
drop(writer);
let writer = tline.writer();
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
writer.finish_write(Lsn(0x20));
drop(writer);
@@ -619,19 +636,6 @@ mod tests {
Ok(())
}
#[test]
fn no_duplicate_timelines() -> Result<()> {
let repo = RepoHarness::create("no_duplicate_timelines")?.load();
let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
Ok(_) => panic!("duplicate timeline creation should fail"),
Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
}
Ok(())
}
/// Convenience function to create a page image with given string as the only content
pub fn test_value(s: &str) -> Value {
let mut buf = BytesMut::new();
@@ -655,24 +659,24 @@ mod tests {
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
// Insert a value on the timeline
writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?;
writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?;
writer.finish_write(Lsn(0x20));
writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?;
writer.finish_write(Lsn(0x30));
writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?;
writer.finish_write(Lsn(0x40));
//assert_current_logical_size(&tline, Lsn(0x40));
// Branch the history, modify relation differently on the new timeline
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
let new_writer = newtline.writer();
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?;
new_writer.finish_write(Lsn(0x40));
// Check page contents on both branches
@@ -703,14 +707,14 @@ mod tests {
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
@@ -721,14 +725,14 @@ mod tests {
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
lsn += 0x10;
writer.put(
*TEST_KEY,
lsn,
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
)?;
writer.finish_write(lsn);
}
@@ -749,7 +753,7 @@ mod tests {
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
// try to branch at lsn 25, should fail because we already garbage collected the data
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
Ok(_) => panic!("branching should have failed"),
Err(err) => {
assert!(err.to_string().contains("invalid branch start lsn"));
@@ -770,7 +774,7 @@ mod tests {
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
Ok(_) => panic!("branching should have failed"),
Err(err) => {
assert!(&err.to_string().contains("invalid branch start lsn"));
@@ -815,7 +819,7 @@ mod tests {
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
@@ -831,7 +835,7 @@ mod tests {
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
make_some_layers(tline.as_ref(), Lsn(0x20))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)
.expect("Should have a local timeline");
@@ -889,7 +893,7 @@ mod tests {
make_some_layers(tline.as_ref(), Lsn(0x20))?;
tline.checkpoint(CheckpointConfig::Forced)?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
let newtline = repo
.get_timeline_load(NEW_TIMELINE_ID)

View File

@@ -178,8 +178,9 @@ use crate::{
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
LayeredRepository,
},
repository::TimelineSyncStatusUpdate,
storage_sync::{self, index::RemoteIndex},
tenant_mgr::attach_downloaded_tenants,
tenant_mgr::apply_timeline_sync_status_updates,
thread_mgr,
thread_mgr::ThreadKind,
};
@@ -190,8 +191,7 @@ use metrics::{
};
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
use self::download::download_index_parts;
pub use self::download::gather_tenant_timelines_index_parts;
pub use self::download::download_index_part;
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
lazy_static! {
@@ -301,7 +301,7 @@ pub fn start_local_timeline_sync(
}
Ok(SyncStartupData {
local_timeline_init_statuses,
remote_index: RemoteIndex::default(),
remote_index: RemoteIndex::empty(),
})
}
}
@@ -835,7 +835,7 @@ where
.build()
.context("Failed to create storage sync runtime")?;
let applicable_index_parts = runtime.block_on(download_index_parts(
let applicable_index_parts = runtime.block_on(try_fetch_index_parts(
conf,
&storage,
local_timeline_files.keys().copied().collect(),
@@ -918,48 +918,16 @@ fn storage_sync_loop<P, S>(
});
match loop_step {
ControlFlow::Continue(updated_tenants) => {
if updated_tenants.is_empty() {
debug!("Sync loop step completed, no new tenant states");
ControlFlow::Continue(new_timeline_states) => {
if new_timeline_states.is_empty() {
debug!("Sync loop step completed, no new timeline states");
} else {
info!(
"Sync loop step completed, {} new tenant state update(s)",
updated_tenants.len()
"Sync loop step completed, {} new timeline state update(s)",
new_timeline_states.len()
);
let mut sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>> =
HashMap::new();
let index_accessor = runtime.block_on(index.read());
for tenant_id in updated_tenants {
let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
Some(tenant_entry) => tenant_entry,
None => {
error!(
"cannot find tenant in remote index for timeline sync update"
);
continue;
}
};
if tenant_entry.has_in_progress_downloads() {
info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration");
continue;
} else {
info!(
"Tenant {tenant_id} download completed. Picking to register in repository"
);
// Here we assume that if the tenant has no in-progress downloads, then
// it was the last completed timeline download that triggered the
// sync status update. So we look at the index for available timelines
// and register them all at once in the repository, so that the registration
// is submitted to the repository as a single operation
// and it can apply them at once to its internal timeline map.
sync_status_updates
.insert(tenant_id, tenant_entry.keys().copied().collect());
}
}
drop(index_accessor);
// Batch the timeline download registrations to ensure that the external registration code won't block any already running tasks.
attach_downloaded_tenants(conf, &index, sync_status_updates);
apply_timeline_sync_status_updates(conf, &index, new_timeline_states);
}
}
ControlFlow::Break(()) => {
@@ -970,14 +938,6 @@ fn storage_sync_loop<P, S>(
}
}
// needed to check whether the download happened
// more informative than just a bool
#[derive(Debug)]
enum DownloadMarker {
Downloaded,
Nothing,
}
async fn process_batches<P, S>(
conf: &'static PageServerConf,
max_sync_errors: NonZeroU32,
@@ -985,7 +945,7 @@ async fn process_batches<P, S>(
index: &RemoteIndex,
batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
sync_queue: &SyncQueue,
) -> HashSet<ZTenantId>
) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1010,19 +970,22 @@ where
})
.collect::<FuturesUnordered<_>>();
let mut downloaded_timelines = HashSet::new();
let mut new_timeline_states: HashMap<
ZTenantId,
HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
> = HashMap::new();
while let Some((sync_id, download_marker)) = sync_results.next().await {
debug!(
"Finished storage sync task for sync id {sync_id} download marker {:?}",
download_marker
);
if matches!(download_marker, DownloadMarker::Downloaded) {
downloaded_timelines.insert(sync_id.tenant_id);
while let Some((sync_id, state_update)) = sync_results.next().await {
debug!("Finished storage sync task for sync id {sync_id}");
if let Some(state_update) = state_update {
new_timeline_states
.entry(sync_id.tenant_id)
.or_default()
.insert(sync_id.timeline_id, state_update);
}
}
downloaded_timelines
new_timeline_states
}
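
The batching above follows a common shape: spawn one future per sync id, drain them through FuturesUnordered, and fold the per-timeline results into a per-tenant map. A stripped-down sketch of that shape, with plain integers and a string standing in for the real id and status types:

use std::collections::HashMap;

use futures::stream::{FuturesUnordered, StreamExt};

/// Run one async job per (tenant, timeline) pair and group the successful
/// results by tenant; `None` results are simply dropped, mirroring the
/// "no status update" case above.
async fn collect_results(
    ids: Vec<(u32, u32)>, // (tenant_id, timeline_id) stand-ins
) -> HashMap<u32, HashMap<u32, &'static str>> {
    let mut results = ids
        .into_iter()
        .map(|(tenant, timeline)| async move {
            // ... the real code awaits a download/upload batch here ...
            let status = if timeline % 2 == 0 { Some("Downloaded") } else { None };
            ((tenant, timeline), status)
        })
        .collect::<FuturesUnordered<_>>();

    let mut grouped: HashMap<u32, HashMap<u32, &'static str>> = HashMap::new();
    while let Some(((tenant, timeline), status)) = results.next().await {
        if let Some(status) = status {
            grouped.entry(tenant).or_default().insert(timeline, status);
        }
    }
    grouped
}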
async fn process_sync_task_batch<P, S>(
@@ -1031,7 +994,7 @@ async fn process_sync_task_batch<P, S>(
max_sync_errors: NonZeroU32,
sync_id: ZTenantTimelineId,
batch: SyncTaskBatch,
) -> DownloadMarker
) -> Option<TimelineSyncStatusUpdate>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1116,7 +1079,7 @@ where
}
}
}
DownloadMarker::Nothing
None
}
.instrument(info_span!("download_timeline_data")),
);
@@ -1170,7 +1133,7 @@ async fn download_timeline_data<P, S>(
new_download_data: SyncData<LayersDownload>,
sync_start: Instant,
task_name: &str,
) -> DownloadMarker
) -> Option<TimelineSyncStatusUpdate>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -1199,7 +1162,7 @@ where
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
Ok(()) => {
register_sync_status(sync_id, sync_start, task_name, Some(true));
return DownloadMarker::Downloaded;
return Some(TimelineSyncStatusUpdate::Downloaded);
}
Err(e) => {
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
@@ -1215,7 +1178,7 @@ where
}
}
DownloadMarker::Nothing
None
}
async fn update_local_metadata(
@@ -1495,6 +1458,35 @@ async fn validate_task_retries<T>(
ControlFlow::Continue(sync_data)
}
async fn try_fetch_index_parts<P, S>(
conf: &'static PageServerConf,
storage: &S,
keys: HashSet<ZTenantTimelineId>,
) -> HashMap<ZTenantTimelineId, IndexPart>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let mut index_parts = HashMap::with_capacity(keys.len());
let mut part_downloads = keys
.into_iter()
.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
.collect::<FuturesUnordered<_>>();
while let Some((id, part_upload_result)) = part_downloads.next().await {
match part_upload_result {
Ok(index_part) => {
debug!("Successfully fetched index part for {id}");
index_parts.insert(id, index_part);
}
Err(e) => warn!("Failed to fetch index part for {id}: {e}"),
}
}
index_parts
}
fn schedule_first_sync_tasks(
index: &mut RemoteTimelineIndex,
sync_queue: &SyncQueue,
@@ -1557,7 +1549,6 @@ fn schedule_first_sync_tasks(
local_timeline_init_statuses
}
/// bool in return value stands for awaits_download
fn compare_local_and_remote_timeline(
new_sync_tasks: &mut VecDeque<(ZTenantTimelineId, SyncTask)>,
sync_id: ZTenantTimelineId,
@@ -1567,6 +1558,14 @@ fn compare_local_and_remote_timeline(
) -> (LocalTimelineInitStatus, bool) {
let remote_files = remote_entry.stored_files();
// TODO: we probably need more sophisticated logic here.
// If more data is available remotely, can we just download what's there,
// without trying to upload something? It may be tricky and needs further investigation.
// For now it looks strange that we can request upload
// and download for the same timeline simultaneously
// (the upload needs to cover only previously unsynced files, not the whole timeline dir).
// If one of the tasks fails, they will be reordered in the queue, which can lead
// to the timeline being stuck in the evicted state.
let number_of_layers_to_download = remote_files.difference(&local_files).count();
let (initial_timeline_status, awaits_download) = if number_of_layers_to_download > 0 {
new_sync_tasks.push_back((

View File

@@ -1,15 +1,10 @@
//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory.
use std::{
collections::{HashMap, HashSet},
fmt::Debug,
mem,
path::Path,
};
use std::{collections::HashSet, fmt::Debug, path::Path};
use anyhow::Context;
use futures::stream::{FuturesUnordered, StreamExt};
use remote_storage::{path_with_suffix_extension, DownloadError, RemoteObjectName, RemoteStorage};
use remote_storage::{path_with_suffix_extension, RemoteStorage};
use tokio::{
fs,
io::{self, AsyncWriteExt},
@@ -19,7 +14,7 @@ use tracing::{debug, error, info, warn};
use crate::{
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
};
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
use utils::zid::ZTenantTimelineId;
use super::{
index::{IndexPart, RemoteTimeline},
@@ -28,155 +23,12 @@ use super::{
pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
// We collect the timelines remotely available for each tenant.
// In case we failed to gather all index parts (due to an error),
// the Poisoned variant is returned.
// When data is received successfully without errors, the Present variant is used.
pub enum TenantIndexParts {
Poisoned {
present: HashMap<ZTimelineId, IndexPart>,
missing: HashSet<ZTimelineId>,
},
Present(HashMap<ZTimelineId, IndexPart>),
}
impl TenantIndexParts {
fn add_poisoned(&mut self, timeline_id: ZTimelineId) {
match self {
TenantIndexParts::Poisoned { missing, .. } => {
missing.insert(timeline_id);
}
TenantIndexParts::Present(present) => {
*self = TenantIndexParts::Poisoned {
present: mem::take(present),
missing: HashSet::from([timeline_id]),
}
}
}
}
}
impl Default for TenantIndexParts {
fn default() -> Self {
TenantIndexParts::Present(HashMap::default())
}
}
pub async fn download_index_parts<P, S>(
conf: &'static PageServerConf,
storage: &S,
keys: HashSet<ZTenantTimelineId>,
) -> HashMap<ZTenantId, TenantIndexParts>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let mut index_parts: HashMap<ZTenantId, TenantIndexParts> = HashMap::new();
let mut part_downloads = keys
.into_iter()
.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
.collect::<FuturesUnordered<_>>();
while let Some((id, part_upload_result)) = part_downloads.next().await {
match part_upload_result {
Ok(index_part) => {
debug!("Successfully fetched index part for {id}");
match index_parts.entry(id.tenant_id).or_default() {
TenantIndexParts::Poisoned { present, .. } => {
present.insert(id.timeline_id, index_part);
}
TenantIndexParts::Present(parts) => {
parts.insert(id.timeline_id, index_part);
}
}
}
Err(download_error) => {
match download_error {
DownloadError::NotFound => {
// that's ok because it means that we didn't upload something we have locally, for example
}
e => {
let tenant_parts = index_parts.entry(id.tenant_id).or_default();
tenant_parts.add_poisoned(id.timeline_id);
error!(
"Failed to fetch index part for {id}: {e} poisoning tenant index parts"
);
}
}
}
}
}
index_parts
}
/// Note: The function is rather expensive from an S3 access point of view: it will execute ceil(N/1000) + N requests.
/// At least one request to obtain the list of tenant timelines (more requests if there are more than 1000 timelines),
/// and then it will attempt to download all index files that belong to these timelines.
pub async fn gather_tenant_timelines_index_parts<P, S>(
conf: &'static PageServerConf,
storage: &S,
tenant_id: ZTenantId,
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>>
where
P: RemoteObjectName + Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
{
let tenant_path = conf.timelines_path(&tenant_id);
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
format!(
"Failed to get tenant storage path for local path '{}'",
tenant_path.display()
)
})?;
let timelines = storage
.list_prefixes(Some(tenant_storage_path))
.await
.with_context(|| {
format!(
"Failed to list tenant storage path to get remote timelines to download: {}",
tenant_id
)
})?;
let mut sync_ids = HashSet::new();
for timeline_remote_storage_key in timelines {
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
})?;
let timeline_id: ZTimelineId = object_name
.parse()
.with_context(|| {
format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'")
})?;
sync_ids.insert(ZTenantTimelineId {
tenant_id,
timeline_id,
});
}
match download_index_parts(conf, storage, sync_ids)
.await
.remove(&tenant_id)
.ok_or_else(|| anyhow::anyhow!("Missing tenant index parts. This is a bug."))?
{
TenantIndexParts::Poisoned { missing, .. } => {
anyhow::bail!("Failed to download index parts for all timelines. Missing {missing:?}")
}
TenantIndexParts::Present(parts) => Ok(parts),
}
}
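
To make the cost note above concrete, a small illustrative calculation; the 1000 figure is the usual list-page size the doc comment assumes, not something guaranteed by the storage API:

/// Rough request count for gathering a tenant's index parts, assuming a
/// 1000-item page size for list_prefixes (an assumption, not a guarantee).
fn request_count(timelines: u64) -> u64 {
    let list_requests = (timelines + 999) / 1000; // ceil(N / 1000)
    list_requests + timelines // plus one index-part download per timeline
}

fn main() {
    // e.g. a tenant with 2500 timelines costs 3 list requests + 2500 downloads.
    assert_eq!(request_count(2500), 3 + 2500);
}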
/// Retrieves index data from the remote storage for a given timeline.
async fn download_index_part<P, S>(
pub async fn download_index_part<P, S>(
conf: &'static PageServerConf,
storage: &S,
sync_id: ZTenantTimelineId,
) -> Result<IndexPart, DownloadError>
) -> anyhow::Result<IndexPart>
where
P: Debug + Send + Sync + 'static,
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
@@ -191,29 +43,18 @@ where
"Failed to get the index part storage path for local path '{}'",
index_part_path.display()
)
})
.map_err(DownloadError::BadInput)?;
let mut index_part_download = storage.download(&part_storage_path).await?;
})?;
let mut index_part_bytes = Vec::new();
io::copy(
&mut index_part_download.download_stream,
&mut index_part_bytes,
)
.await
.with_context(|| {
format!("Failed to download an index part from storage path {part_storage_path:?}")
})
.map_err(DownloadError::Other)?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes)
storage
.download(&part_storage_path, &mut index_part_bytes)
.await
.with_context(|| {
format!(
"Failed to deserialize index part file from storage path '{part_storage_path:?}'"
)
})
.map_err(DownloadError::Other)?;
format!("Failed to download an index part from storage path {part_storage_path:?}")
})?;
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'")
})?;
let missing_files = index_part.missing_files();
if !missing_files.is_empty() {
@@ -321,19 +162,15 @@ where
temp_file_path.display()
)
})?;
let mut download = storage
.download(&layer_storage_path)
storage
.download(&layer_storage_path, &mut destination_file)
.await
.with_context(|| {
format!(
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
"Failed to download a layer from storage path '{layer_storage_path:?}'"
)
})?;
io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
format!(
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
)
})?;
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
// A file will not be closed immediately when it goes out of scope if there are any IO operations

View File

@@ -2,7 +2,6 @@
//! Able to restore itself from the storage index parts, which are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
use std::ops::{Deref, DerefMut};
use std::{
collections::{HashMap, HashSet},
path::{Path, PathBuf},
@@ -13,15 +12,9 @@ use anyhow::{anyhow, Context, Ok};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use tokio::sync::RwLock;
use tracing::log::warn;
use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata};
use utils::{
lsn::Lsn,
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
};
use super::download::TenantIndexParts;
use utils::{lsn::Lsn, zid::ZTenantTimelineId};
/// A part of the filesystem path, that needs a root to become a path again.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
@@ -48,74 +41,38 @@ impl RelativePath {
}
}
#[derive(Debug, Clone, Default)]
pub struct TenantEntry(HashMap<ZTimelineId, RemoteTimeline>);
impl TenantEntry {
pub fn has_in_progress_downloads(&self) -> bool {
self.values()
.any(|remote_timeline| remote_timeline.awaits_download)
}
}
impl Deref for TenantEntry {
type Target = HashMap<ZTimelineId, RemoteTimeline>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl DerefMut for TenantEntry {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
impl From<HashMap<ZTimelineId, RemoteTimeline>> for TenantEntry {
fn from(inner: HashMap<ZTimelineId, RemoteTimeline>) -> Self {
Self(inner)
}
}
/// An index to track tenant files that exist on the remote storage.
#[derive(Debug, Clone, Default)]
#[derive(Debug, Clone)]
pub struct RemoteTimelineIndex {
entries: HashMap<ZTenantId, TenantEntry>,
timeline_entries: HashMap<ZTenantTimelineId, RemoteTimeline>,
}
/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
#[derive(Default)]
pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
impl RemoteIndex {
pub fn empty() -> Self {
Self(Arc::new(RwLock::new(RemoteTimelineIndex {
timeline_entries: HashMap::new(),
})))
}
pub fn from_parts(
conf: &'static PageServerConf,
index_parts: HashMap<ZTenantId, TenantIndexParts>,
index_parts: HashMap<ZTenantTimelineId, IndexPart>,
) -> anyhow::Result<Self> {
let mut entries: HashMap<ZTenantId, TenantEntry> = HashMap::new();
let mut timeline_entries = HashMap::new();
for (tenant_id, index_parts) in index_parts {
match index_parts {
// TODO: should we schedule a retry so it can be recovered? otherwise we can revive it only through detach/attach or pageserver restart
TenantIndexParts::Poisoned { missing, ..} => warn!("skipping tenant_id set up for remote index because the index download has failed for timeline(s): {missing:?}"),
TenantIndexParts::Present(timelines) => {
for (timeline_id, index_part) in timelines {
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
let remote_timeline =
RemoteTimeline::from_index_part(&timeline_path, index_part)
.context("Failed to restore remote timeline data from index part")?;
entries
.entry(tenant_id)
.or_default()
.insert(timeline_id, remote_timeline);
}
},
}
for (sync_id, index_part) in index_parts {
let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
.context("Failed to restore remote timeline data from index part")?;
timeline_entries.insert(sync_id, remote_timeline);
}
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries }))))
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex {
timeline_entries,
}))))
}
pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
@@ -134,67 +91,20 @@ impl Clone for RemoteIndex {
}
impl RemoteTimelineIndex {
pub fn timeline_entry(
&self,
ZTenantTimelineId {
tenant_id,
timeline_id,
}: &ZTenantTimelineId,
) -> Option<&RemoteTimeline> {
self.entries.get(tenant_id)?.get(timeline_id)
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> {
self.timeline_entries.get(id)
}
pub fn timeline_entry_mut(
&mut self,
ZTenantTimelineId {
tenant_id,
timeline_id,
}: &ZTenantTimelineId,
) -> Option<&mut RemoteTimeline> {
self.entries.get_mut(tenant_id)?.get_mut(timeline_id)
pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> {
self.timeline_entries.get_mut(id)
}
pub fn add_timeline_entry(
&mut self,
ZTenantTimelineId {
tenant_id,
timeline_id,
}: ZTenantTimelineId,
entry: RemoteTimeline,
) {
self.entries
.entry(tenant_id)
.or_default()
.insert(timeline_id, entry);
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) {
self.timeline_entries.insert(id, entry);
}
pub fn remove_timeline_entry(
&mut self,
ZTenantTimelineId {
tenant_id,
timeline_id,
}: ZTenantTimelineId,
) -> Option<RemoteTimeline> {
self.entries
.entry(tenant_id)
.or_default()
.remove(&timeline_id)
}
pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> {
self.entries.get(tenant_id)
}
pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> {
self.entries.get_mut(tenant_id)
}
pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry {
self.entries.entry(tenant_id).or_default()
}
pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option<TenantEntry> {
self.entries.remove(tenant_id)
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
self.timeline_entries.keys().copied()
}
pub fn set_awaits_download(

View File

@@ -37,7 +37,7 @@ pub mod defaults {
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000;
}
/// Per-tenant configuration options

View File

@@ -4,8 +4,8 @@
use crate::config::PageServerConf;
use crate::layered_repository::{load_metadata, LayeredRepository};
use crate::pgdatadir_mapping::DatadirTimeline;
use crate::repository::Repository;
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
use crate::repository::{Repository, TimelineSyncStatusUpdate};
use crate::storage_sync::index::RemoteIndex;
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
use crate::tenant_config::TenantConfOpt;
use crate::thread_mgr::ThreadKind;
@@ -13,11 +13,11 @@ use crate::timelines::CreateRepo;
use crate::walredo::PostgresRedoManager;
use crate::{thread_mgr, timelines, walreceiver};
use crate::{DatadirTimelineImpl, RepositoryImpl};
use anyhow::Context;
use anyhow::{bail, Context};
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
use tokio::sync::mpsc;
@@ -157,13 +157,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
// loading a tenant is serious, but it's better to complete the startup and
// serve other tenants, than fail completely.
error!("Failed to initialize local tenant {tenant_id}: {:?}", err);
if let Err(err) = set_tenant_state(tenant_id, TenantState::Broken) {
error!(
"Failed to set tenant state to broken {tenant_id}: {:?}",
err
);
}
set_tenant_state(tenant_id, TenantState::Broken)?;
}
}
@@ -171,51 +165,44 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
}
pub enum LocalTimelineUpdate {
Detach {
id: ZTenantTimelineId,
// used to signal to the detach caller that walreceiver successfully terminated for specified id
join_confirmation_sender: std::sync::mpsc::Sender<()>,
},
Attach {
id: ZTenantTimelineId,
datadir: Arc<DatadirTimelineImpl>,
},
Detach(ZTenantTimelineId),
Attach(ZTenantTimelineId, Arc<DatadirTimelineImpl>),
}
impl std::fmt::Debug for LocalTimelineUpdate {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(),
Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(),
Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(),
Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(),
}
}
}
/// Updates tenants' repositories, changing their timelines' state in memory.
pub fn attach_downloaded_tenants(
pub fn apply_timeline_sync_status_updates(
conf: &'static PageServerConf,
remote_index: &RemoteIndex,
sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>>,
sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
) {
if sync_status_updates.is_empty() {
debug!("No sync status updates to apply");
debug!("no sync status updates to apply");
return;
}
for (tenant_id, downloaded_timelines) in sync_status_updates {
info!(
"Registering downlloaded timelines for {tenant_id} {} timelines",
downloaded_timelines.len()
);
debug!("Downloaded timelines: {downloaded_timelines:?}");
info!(
"Applying sync status updates for {} timelines",
sync_status_updates.len()
);
debug!("Sync status updates: {sync_status_updates:?}");
for (tenant_id, status_updates) in sync_status_updates {
let repo = match load_local_repo(conf, tenant_id, remote_index) {
Ok(repo) => repo,
Err(e) => {
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}");
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",);
continue;
}
};
match attach_downloaded_tenant(&repo, downloaded_timelines) {
match apply_timeline_remote_sync_status_updates(&repo, status_updates) {
Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"),
Err(e) => error!(
"Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}"
@@ -360,6 +347,7 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
);
// Wait until all gc/compaction tasks finish
// TODO send cancellation signal too, or make the state a watch
let repo = get_repository_for_tenant(tenant_id)?;
let _guard = repo.file_lock.write().unwrap();
}
@@ -400,86 +388,33 @@ pub fn get_local_timeline_with_load(
}
}
pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> {
// Start with the shutdown of timeline tasks (this shuts down the walreceiver)
// It is important that we do not take locks here, and do not check whether the timeline exists,
// because if we hold tenants_state::write_tenants() while waiting for the threads to join,
// we cannot create new timelines and tenants, and that can take quite some time;
// it can even become stuck due to a bug, making the whole pageserver unavailable for some operations.
// So this is how we deal with concurrent delete requests: shut down everything, wait for confirmation,
// and then try to actually remove the timeline from in-memory state. This is the point where concurrent requests
// will synchronize and either fail with a not-found error or succeed.
pub fn detach_timeline(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) -> anyhow::Result<()> {
// shutdown the timeline threads (this shuts down the walreceiver)
thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
let (sender, receiver) = std::sync::mpsc::channel::<()>();
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
id: ZTenantTimelineId::new(tenant_id, timeline_id),
join_confirmation_sender: sender,
});
debug!("waiting for wal receiver to shutdown");
let _ = receiver.recv();
debug!("wal receiver shutdown confirmed");
debug!("waiting for threads to shutdown");
thread_mgr::shutdown_threads(None, None, Some(timeline_id));
debug!("thread shutdown completed");
match tenants_state::write_tenants().get_mut(&tenant_id) {
Some(tenant) => {
tenant.repo.delete_timeline(timeline_id)?;
tenant
.repo
.detach_timeline(timeline_id)
.context("Failed to detach inmem tenant timeline")?;
tenant.local_timelines.remove(&timeline_id);
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach(
ZTenantTimelineId::new(tenant_id, timeline_id),
));
}
None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"),
None => bail!("Tenant {tenant_id} not found in local tenant state"),
}
Ok(())
}
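
A minimal, generic sketch of the shutdown-and-confirm pattern the comment above describes (plain std types, not the pageserver's thread_mgr): the caller asks a worker to stop, waits on a confirmation channel without holding any shared lock, and only then takes the lock to update shared state. All names here are illustrative:

use std::collections::HashMap;
use std::sync::{mpsc, Arc, Mutex};
use std::thread;

fn detach_entry(state: Arc<Mutex<HashMap<u64, String>>>, id: u64) {
    // 1. Ask the worker to stop and hand it a confirmation sender.
    let (confirm_tx, confirm_rx) = mpsc::channel::<()>();
    let worker = thread::spawn(move || {
        // ... the worker tears down its resources here ...
        drop(confirm_tx); // dropping the sender unblocks recv() on the other side
    });

    // 2. Wait for confirmation WITHOUT holding the state lock, so other
    //    requests can still create/inspect entries in the meantime.
    let _ = confirm_rx.recv();
    worker.join().expect("worker panicked");

    // 3. Only now take the lock; concurrent detach calls synchronize here and
    //    either succeed or observe that the entry is already gone.
    state.lock().unwrap().remove(&id);
}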
pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> {
set_tenant_state(tenant_id, TenantState::Stopping)?;
// shutdown the tenant and timeline threads: gc, compaction, page service threads)
thread_mgr::shutdown_threads(None, Some(tenant_id), None);
// FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state?
// send stop signal to wal receiver and collect join handles while holding the lock
let walreceiver_join_handles = {
let tenants = tenants_state::write_tenants();
let tenant = tenants.get(&tenant_id).context("tenant not found")?;
let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len());
for timeline_id in tenant.local_timelines.keys() {
let (sender, receiver) = std::sync::mpsc::channel::<()>();
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
id: ZTenantTimelineId::new(tenant_id, *timeline_id),
join_confirmation_sender: sender,
});
walreceiver_join_handles.push((*timeline_id, receiver));
}
// drop the tenants lock
walreceiver_join_handles
};
// wait for wal receivers to stop without holding the lock, because walreceiver
// will attempt to change tenant state which is protected by the same global tenants lock.
// TODO do we need a timeout here? how to handle it?
// recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631
// need to use crossbeam-channel
for (timeline_id, join_handle) in walreceiver_join_handles {
info!("waiting for wal receiver to shutdown timeline_id {timeline_id}");
join_handle.recv().context("failed to join walreceiver")?;
info!("wal receiver shutdown confirmed timeline_id {timeline_id}");
}
tenants_state::write_tenants().remove(&tenant_id);
// If removal fails there will be no way to successfully retry detach,
// because the tenant no longer exists in the in-memory map. And it needs to be removed from that map
// before we remove the files, because it contains references to the repository,
// which references ephemeral files that are deleted on drop. So if we keep these references,
// the code will attempt to remove files which no longer exist. This can be fixed by having a shutdown
// mechanism for the repository that cleans up temporary data, avoiding any references to ephemeral files.
let local_tenant_directory = conf.tenant_path(&tenant_id);
std::fs::remove_dir_all(&local_tenant_directory).with_context(|| {
let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id);
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
format!(
"Failed to remove local timeline directory '{}'",
local_tenant_directory.display()
local_timeline_directory.display()
)
})?;
@@ -500,10 +435,10 @@ fn load_local_timeline(
));
page_tline.init_logical_size()?;
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach {
id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
datadir: Arc::clone(&page_tline),
});
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach(
ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
Arc::clone(&page_tline),
));
Ok(page_tline)
}
@@ -513,27 +448,15 @@ fn load_local_timeline(
pub struct TenantInfo {
#[serde_as(as = "DisplayFromStr")]
pub id: ZTenantId,
pub state: Option<TenantState>,
pub has_in_progress_downloads: Option<bool>,
pub state: TenantState,
}
pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
pub fn list_tenants() -> Vec<TenantInfo> {
tenants_state::read_tenants()
.iter()
.map(|(id, tenant)| {
let has_in_progress_downloads = remote_index
.tenant_entry(id)
.map(|entry| entry.has_in_progress_downloads());
if has_in_progress_downloads.is_none() {
error!("timeline is not found in remote index while it is present in the tenants registry")
}
TenantInfo {
id: *id,
state: Some(tenant.state),
has_in_progress_downloads,
}
.map(|(id, tenant)| TenantInfo {
id: *id,
state: tenant.state,
})
.collect()
}
@@ -545,73 +468,74 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
/// A timeline is categorized as broken when any of the following conditions is true:
/// - failed to load the timeline's metadata
/// - the timeline's disk consistent LSN is zero
fn check_broken_timeline(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
) -> anyhow::Result<()> {
let metadata =
load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?;
fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id())
.context("failed to load metadata")?;
// A timeline with zero disk consistent LSN can happen when the page server
// failed to checkpoint the timeline import data when creating that timeline.
if metadata.disk_consistent_lsn() == Lsn::INVALID {
anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
}
Ok(())
}
/// Note: all timelines are attached at once if and only if all of them are locally complete
fn init_local_repository(
conf: &'static PageServerConf,
tenant_id: ZTenantId,
local_timeline_init_statuses: HashMap<ZTimelineId, LocalTimelineInitStatus>,
remote_index: &RemoteIndex,
) -> anyhow::Result<(), anyhow::Error> {
let mut timelines_to_attach = HashSet::new();
// initialize local tenant
let repo = load_local_repo(conf, tenant_id, remote_index)
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
for (timeline_id, init_status) in local_timeline_init_statuses {
match init_status {
LocalTimelineInitStatus::LocallyComplete => {
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
check_broken_timeline(conf, tenant_id, timeline_id)
.context("found broken timeline")?;
timelines_to_attach.insert(timeline_id);
if let Err(err) = check_broken_timeline(&repo, timeline_id) {
info!(
"Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository"
);
} else {
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
}
}
LocalTimelineInitStatus::NeedsSync => {
debug!(
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
so skipped for adding into repository until sync is finished"
);
return Ok(());
}
}
}
// initialize local tenant
let repo = load_local_repo(conf, tenant_id, remote_index)
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
// Let's fail loudly here to be on the safe side.
// XXX: It may be a better api to actually distinguish between repository startup
// and processing of newly downloaded timelines.
attach_downloaded_tenant(&repo, timelines_to_attach)
apply_timeline_remote_sync_status_updates(&repo, status_updates)
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?;
Ok(())
}
fn attach_downloaded_tenant(
fn apply_timeline_remote_sync_status_updates(
repo: &LayeredRepository,
downloaded_timelines: HashSet<ZTimelineId>,
status_updates: HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
) -> anyhow::Result<()> {
let mut registration_queue = Vec::with_capacity(downloaded_timelines.len());
let mut registration_queue = Vec::with_capacity(status_updates.len());
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
for timeline_id in downloaded_timelines {
repo.attach_timeline(timeline_id).with_context(|| {
format!("Failed to load timeline {timeline_id} into in-memory repository")
})?;
registration_queue.push(timeline_id);
for (timeline_id, status_update) in status_updates {
repo.apply_timeline_remote_sync_status_update(timeline_id, status_update)
.with_context(|| {
format!("Failed to load timeline {timeline_id} into in-memory repository")
})?;
match status_update {
TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id),
}
}
for timeline_id in registration_queue {
@@ -619,7 +543,7 @@ fn attach_downloaded_tenant(
match tenants_state::write_tenants().get_mut(&tenant_id) {
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
Entry::Occupied(_) => {
anyhow::bail!("Local timeline {timeline_id} already registered")
bail!("Local timeline {timeline_id} already registered")
}
Entry::Vacant(v) => {
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
@@ -627,7 +551,7 @@ fn attach_downloaded_tenant(
})?);
}
},
None => anyhow::bail!(
None => bail!(
"Tenant {} not found in local tenant state",
repo.tenant_id()
),

View File

@@ -119,6 +119,8 @@ pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
pub fn init_tenant_task_pool() -> anyhow::Result<()> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.thread_name("tenant-task-worker")
.worker_threads(40) // Way more than necessary
.max_blocking_threads(100) // Way more than necessary
.enable_all()
.build()?;
@@ -176,7 +178,7 @@ pub fn init_tenant_task_pool() -> anyhow::Result<()> {
// Spawn new task, request cancellation of the old one if exists
let (cancel_send, cancel_recv) = watch::channel(());
let handle = tokio::spawn(gc_loop(tenantid, cancel_recv)
.instrument(info_span!("gc loop", tenant = %tenantid)));
.instrument(trace_span!("gc loop", tenant = %tenantid)));
if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) {
old_cancel_send.send(()).ok();
}
@@ -190,8 +192,9 @@ pub fn init_tenant_task_pool() -> anyhow::Result<()> {
// Spawn new task, request cancellation of the old one if exists
let (cancel_send, cancel_recv) = watch::channel(());
// TODO this instrument doesn't work
let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv)
.instrument(info_span!("compaction loop", tenant = %tenantid)));
.instrument(trace_span!("compaction loop", tenant = %tenantid)));
if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) {
old_cancel_send.send(()).ok();
}
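
The two hunks above share one pattern: every per-tenant loop gets its own `watch` channel, and inserting the new sender into the map hands back the previous sender, which is signalled so the old loop can wind down. Below is a minimal, self-contained sketch of that pattern; the names `demo_loop` and `TenantId` and the sleep-based loop body are illustrative stand-ins, not pageserver API.

```rust
use std::collections::HashMap;
use std::time::Duration;
use tokio::sync::watch;

type TenantId = u64; // stand-in for ZTenantId

async fn demo_loop(tenant: TenantId, mut cancel: watch::Receiver<()>) {
    loop {
        tokio::select! {
            // Stop when cancellation is requested (or the sender is dropped).
            _ = cancel.changed() => break,
            _ = tokio::time::sleep(Duration::from_millis(10)) => {
                // One gc/compaction iteration would run here.
            }
        }
    }
    println!("loop for tenant {tenant} stopped");
}

#[tokio::main]
async fn main() {
    let mut loops: HashMap<TenantId, watch::Sender<()>> = HashMap::new();
    for _ in 0..2 {
        let (cancel_send, cancel_recv) = watch::channel(());
        tokio::spawn(demo_loop(1, cancel_recv));
        // Re-spawning for the same tenant hands back the old sender; signalling
        // it asks the previous loop to exit.
        if let Some(old_cancel_send) = loops.insert(1, cancel_send) {
            old_cancel_send.send(()).ok();
        }
    }
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```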

View File

@@ -202,7 +202,7 @@ pub fn create_repo(
// anymore, but I think that could still happen.
let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {});
(wal_redo_manager as _, RemoteIndex::default())
(wal_redo_manager as _, RemoteIndex::empty())
}
};
@@ -347,7 +347,7 @@ pub(crate) fn create_timeline(
tenant_id: ZTenantId,
new_timeline_id: Option<ZTimelineId>,
ancestor_timeline_id: Option<ZTimelineId>,
mut ancestor_start_lsn: Option<Lsn>,
ancestor_start_lsn: Option<Lsn>,
) -> Result<Option<TimelineInfo>> {
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
@@ -357,35 +357,41 @@ pub(crate) fn create_timeline(
return Ok(None);
}
let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
let new_timeline_info = match ancestor_timeline_id {
Some(ancestor_timeline_id) => {
let ancestor_timeline = repo
.get_timeline_load(ancestor_timeline_id)
.context("Cannot branch off the timeline that's not present locally")?;
if let Some(lsn) = ancestor_start_lsn.as_mut() {
if start_lsn == Lsn(0) {
// Find end of WAL on the old timeline
let end_of_wal = ancestor_timeline.get_last_record_lsn();
info!("branching at end of WAL: {}", end_of_wal);
start_lsn = end_of_wal;
} else {
// Wait for the WAL to arrive and be processed on the parent branch up
// to the requested branch point. The repository code itself doesn't
// require it, but if we start to receive WAL on the new timeline,
// decoding the new WAL might need to look up previous pages, relation
// sizes etc. and that would get confused if the previous page versions
// are not in the repository yet.
*lsn = lsn.align();
ancestor_timeline.wait_lsn(*lsn)?;
ancestor_timeline.wait_lsn(start_lsn)?;
}
start_lsn = start_lsn.align();
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > *lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!(
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
if ancestor_ancestor_lsn > start_lsn {
// can we safely just branch from the ancestor instead?
anyhow::bail!(
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
lsn,
start_lsn,
ancestor_timeline_id,
ancestor_ancestor_lsn,
);
}
}
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?;
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
// load the timeline into memory
let loaded_timeline =
tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
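
Since the old and new lines are interleaved in the hunk above, here is a condensed sketch of the branch-point rule the new code converges on. Plain `u64` stands in for `Lsn`, the `align()`/`wait_lsn()` steps are reduced to a comment, and `resolve_branch_point` is an illustrative helper, not the actual `create_timeline` body.

```rust
/// Illustrative only: pick the LSN to branch at. With no requested LSN, branch
/// at the ancestor's end of WAL; otherwise use the requested LSN (the real code
/// additionally aligns it and calls `wait_lsn` on the ancestor), and refuse
/// anything older than the ancestor's own branch point.
fn resolve_branch_point(
    requested: Option<u64>,
    ancestor_end_of_wal: u64,
    ancestor_ancestor_lsn: u64,
) -> Result<u64, String> {
    let start_lsn = requested.unwrap_or(ancestor_end_of_wal);
    if ancestor_ancestor_lsn > start_lsn {
        return Err(format!(
            "invalid start lsn {start_lsn}: less than timeline ancestor lsn {ancestor_ancestor_lsn}"
        ));
    }
    Ok(start_lsn)
}

fn main() {
    assert_eq!(resolve_branch_point(None, 500, 100), Ok(500));
    assert!(resolve_branch_point(Some(50), 500, 100).is_err());
}
```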

View File

@@ -91,6 +91,7 @@ pub fn init_wal_receiver_main_thread(
let runtime = tokio::runtime::Builder::new_multi_thread()
.thread_name("wal-receiver-runtime-thread")
.worker_threads(40)
.enable_all()
.on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true)))
.build()
@@ -177,7 +178,7 @@ async fn shutdown_all_wal_connections(
/// That may lead to certain events not being observed by the listener.
#[derive(Debug)]
struct TaskHandle<E> {
handle: JoinHandle<Result<(), String>>,
handle: JoinHandle<()>,
events_receiver: watch::Receiver<TaskEvent<E>>,
cancellation: watch::Sender<()>,
}
@@ -204,8 +205,8 @@ impl<E: Clone> TaskHandle<E> {
let sender = Arc::clone(&events_sender);
let handle = tokio::task::spawn(async move {
events_sender.send(TaskEvent::Started).ok();
task(sender, cancellation_receiver).await
let task_result = task(sender, cancellation_receiver).await;
events_sender.send(TaskEvent::End(task_result)).ok();
});
TaskHandle {
@@ -215,16 +216,6 @@ impl<E: Clone> TaskHandle<E> {
}
}
async fn next_task_event(&mut self) -> TaskEvent<E> {
select! {
next_task_event = self.events_receiver.changed() => match next_task_event {
Ok(()) => self.events_receiver.borrow().clone(),
Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await,
},
task_completion_result = join_on_handle(&mut self.handle) => task_completion_result,
}
}
/// Aborts current task, waiting for it to finish.
async fn shutdown(self) {
self.cancellation.send(()).ok();
@@ -234,19 +225,6 @@ impl<E: Clone> TaskHandle<E> {
}
}
async fn join_on_handle<E>(handle: &mut JoinHandle<Result<(), String>>) -> TaskEvent<E> {
match handle.await {
Ok(task_result) => TaskEvent::End(task_result),
Err(e) => {
if e.is_cancelled() {
TaskEvent::End(Ok(()))
} else {
TaskEvent::End(Err(format!("WAL receiver task panicked: {e}")))
}
}
}
}
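
The net effect of these hunks is that the spawned task reports its own terminal state through the events channel, so the listener no longer needs to join the `JoinHandle`. A standalone sketch of that shape, with a concrete event type in place of the generic `TaskEvent<E>` and only the watch-channel mechanics:

```rust
use tokio::sync::watch;

#[derive(Clone, Debug)]
enum TaskEvent {
    Started,
    Progress(u64),
    End(Result<(), String>),
}

#[tokio::main]
async fn main() {
    let (events_sender, mut events_receiver) = watch::channel(TaskEvent::Started);

    tokio::spawn(async move {
        events_sender.send(TaskEvent::Progress(42)).ok();
        // The task reports how it ended; nobody has to await the JoinHandle.
        let task_result: Result<(), String> = Ok(());
        events_sender.send(TaskEvent::End(task_result)).ok();
    });

    // A watch channel only keeps the latest value, so intermediate events can be
    // skipped: exactly the caveat the partial doc comment above mentions.
    while events_receiver.changed().await.is_ok() {
        let event = events_receiver.borrow().clone();
        println!("observed {event:?}");
        if matches!(event, TaskEvent::End(_)) {
            break;
        }
    }
}
```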
/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery.
/// In addition to WAL streaming management, the step ensures that corresponding tenant has its service threads enabled or disabled.
/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled.
@@ -264,10 +242,7 @@ async fn wal_receiver_main_thread_loop_step<'a>(
info!("Processing timeline update: {update:?}");
match update {
// Timeline got detached, stop all related tasks and remove public timeline data.
LocalTimelineUpdate::Detach {
id,
join_confirmation_sender,
} => {
LocalTimelineUpdate::Detach(id) => {
match local_timeline_wal_receivers.get_mut(&id.tenant_id) {
Some(wal_receivers) => {
if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) {
@@ -283,48 +258,44 @@ async fn wal_receiver_main_thread_loop_step<'a>(
};
{
WAL_RECEIVER_ENTRIES.write().await.remove(&id);
if let Err(e) = join_confirmation_sender.send(()) {
warn!("cannot send wal_receiver shutdown confirmation {e}")
} else {
info!("confirm walreceiver shutdown for {id}");
}
}
}
// Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly.
LocalTimelineUpdate::Attach { id, datadir } => {
LocalTimelineUpdate::Attach(new_id, new_timeline) => {
let timeline_connection_managers = local_timeline_wal_receivers
.entry(id.tenant_id)
.entry(new_id.tenant_id)
.or_default();
if timeline_connection_managers.is_empty() {
if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await
if let Err(e) =
change_tenant_state(new_id.tenant_id, TenantState::Active).await
{
error!("Failed to make tenant active for id {id}: {e:#}");
error!("Failed to make tenant active for id {new_id}: {e:#}");
return;
}
}
let vacant_connection_manager_entry =
match timeline_connection_managers.entry(id.timeline_id) {
match timeline_connection_managers.entry(new_id.timeline_id) {
hash_map::Entry::Occupied(_) => {
debug!("Attepted to readd an existing timeline {id}, ignoring");
debug!("Attepted to readd an existing timeline {new_id}, ignoring");
return;
}
hash_map::Entry::Vacant(v) => v,
};
let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) =
match fetch_tenant_settings(id.tenant_id).await {
match fetch_tenant_settings(new_id.tenant_id).await {
Ok(settings) => settings,
Err(e) => {
error!("Failed to fetch tenant settings for id {id}: {e:#}");
error!("Failed to fetch tenant settings for id {new_id}: {e:#}");
return;
}
};
{
WAL_RECEIVER_ENTRIES.write().await.insert(
id,
new_id,
WalReceiverEntry {
wal_producer_connstr: None,
last_received_msg_lsn: None,
@@ -335,10 +306,10 @@ async fn wal_receiver_main_thread_loop_step<'a>(
vacant_connection_manager_entry.insert(
connection_manager::spawn_connection_manager_task(
id,
new_id,
broker_prefix.to_owned(),
etcd_client.clone(),
datadir,
new_timeline,
wal_connect_timeout,
lagging_wal_timeout,
max_lsn_wal_lag,

View File

@@ -104,29 +104,49 @@ async fn connection_manager_loop_step(
Some(wal_connection_update) = async {
match walreceiver_state.wal_connection.as_mut() {
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
Some(wal_connection) => {
let receiver = &mut wal_connection.connection_task.events_receiver;
Some(match receiver.changed().await {
Ok(()) => receiver.borrow().clone(),
Err(_cancellation_error) => TaskEvent::End(Ok(())),
})
}
None => None,
}
} => {
let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard");
match &wal_connection_update {
TaskEvent::Started => {
wal_connection.latest_connection_update = Utc::now().naive_utc();
*walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1;
},
TaskEvent::NewEvent(replication_feedback) => {
wal_connection.latest_connection_update = DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc();
// reset connection attempts here only, the only place where both nodes
// explicitly confirm with replication feedback that they are connected to each other
walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
},
let (connection_update, reset_connection_attempts) = match &wal_connection_update {
TaskEvent::Started => (Some(Utc::now().naive_utc()), true),
TaskEvent::NewEvent(replication_feedback) => (Some(DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc()), true),
TaskEvent::End(end_result) => {
match end_result {
Ok(()) => debug!("WAL receiving task finished"),
Err(e) => warn!("WAL receiving task failed: {e}"),
let should_reset_connection_attempts = match end_result {
Ok(()) => {
debug!("WAL receiving task finished");
true
},
Err(e) => {
warn!("WAL receiving task failed: {e}");
false
},
};
walreceiver_state.wal_connection = None;
(None, should_reset_connection_attempts)
},
};
if let Some(connection_update) = connection_update {
match &mut walreceiver_state.wal_connection {
Some(wal_connection) => {
wal_connection.latest_connection_update = connection_update;
let attempts_entry = walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0);
if reset_connection_attempts {
*attempts_entry = 0;
} else {
*attempts_entry += 1;
}
},
None => error!("Received connection update for WAL connection that is not active, update: {wal_connection_update:?}"),
}
}
},
@@ -386,8 +406,10 @@ impl WalreceiverState {
Some(existing_wal_connection) => {
let connected_sk_node = existing_wal_connection.sk_id;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) =
self.select_connection_candidate(Some(connected_sk_node))?;
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = self
.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| sk_id != connected_sk_node)
.max_by_key(|(_, info, _)| info.commit_lsn)?;
let now = Utc::now().naive_utc();
if let Ok(latest_interaciton) =
@@ -440,8 +462,9 @@ impl WalreceiverState {
}
}
None => {
let (new_sk_id, _, new_wal_producer_connstr) =
self.select_connection_candidate(None)?;
let (new_sk_id, _, new_wal_producer_connstr) = self
.applicable_connection_candidates()
.max_by_key(|(_, info, _)| info.commit_lsn)?;
return Some(NewWalConnectionCandidate {
safekeeper_id: new_sk_id,
wal_producer_connstr: new_wal_producer_connstr,
@@ -453,49 +476,6 @@ impl WalreceiverState {
None
}
/// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
/// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
///
/// The candidate that is chosen:
/// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent)
/// * has greatest data Lsn among the ones that are left
///
/// NOTE:
/// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but
/// otherwise to reset the connection attempts, a successful connection to that node is needed.
/// That won't happen until all nodes with fewer connection attempts have been connected to first, which might leave the safekeeper node with the more advanced state ignored.
fn select_connection_candidate(
&self,
node_to_omit: Option<NodeId>,
) -> Option<(NodeId, &SkTimelineInfo, String)> {
let all_candidates = self
.applicable_connection_candidates()
.filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
.collect::<Vec<_>>();
let smallest_attempts_allowed = all_candidates
.iter()
.map(|(sk_id, _, _)| {
self.wal_connection_attempts
.get(sk_id)
.copied()
.unwrap_or(0)
})
.min()?;
all_candidates
.into_iter()
.filter(|(sk_id, _, _)| {
smallest_attempts_allowed
>= self
.wal_connection_attempts
.get(sk_id)
.copied()
.unwrap_or(0)
})
.max_by_key(|(_, info, _)| info.commit_lsn)
}
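
With the attempt-count heuristic above removed, candidate selection reduces to "skip the currently connected safekeeper, take the highest `commit_lsn`". A tiny illustration with simplified types: `u64` stands in for `NodeId`/`Lsn`, and `pick_candidate` is a made-up helper, not the real `WalreceiverState` API.

```rust
type NodeId = u64; // stand-ins for the real NodeId / Lsn types
type Lsn = u64;

/// Illustrative helper: among applicable candidates, skip the currently
/// connected safekeeper (if any) and take the one with the greatest commit_lsn.
fn pick_candidate(
    candidates: &[(NodeId, Lsn)],
    connected: Option<NodeId>,
) -> Option<(NodeId, Lsn)> {
    candidates
        .iter()
        .copied()
        .filter(|&(sk_id, _)| Some(sk_id) != connected)
        .max_by_key(|&(_, commit_lsn)| commit_lsn)
}

fn main() {
    let candidates = [(0, 150), (1, 100), (2, 200)];
    assert_eq!(pick_candidate(&candidates, Some(2)), Some((0, 150)));
    assert_eq!(pick_candidate(&candidates, None), Some((2, 200)));
}
```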
fn applicable_connection_candidates(
&self,
) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, String)> {
@@ -520,25 +500,15 @@ impl WalreceiverState {
}
fn cleanup_old_candidates(&mut self) {
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
self.wal_stream_candidates.retain(|node_id, etcd_info| {
self.wal_stream_candidates.retain(|_, etcd_info| {
if let Ok(time_since_latest_etcd_update) =
(Utc::now().naive_utc() - etcd_info.latest_update).to_std()
{
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
if !should_retain {
node_ids_to_remove.push(*node_id);
}
should_retain
time_since_latest_etcd_update < self.lagging_wal_timeout
} else {
true
}
});
for node_id in node_ids_to_remove {
self.wal_connection_attempts.remove(&node_id);
}
}
}
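
The retained logic is a plain time-based `retain`; the extra pass that also purged `wal_connection_attempts` is gone. Roughly, under simplified types (`Instant` instead of the chrono timestamps the real code gets from etcd):

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

struct EtcdInfo {
    latest_update: Instant, // the real code tracks chrono timestamps from etcd
}

/// Drop candidates whose etcd data has not been refreshed within the timeout.
fn cleanup_old_candidates(
    candidates: &mut HashMap<u64, EtcdInfo>,
    lagging_wal_timeout: Duration,
) {
    let now = Instant::now();
    candidates.retain(|_, info| now.duration_since(info.latest_update) < lagging_wal_timeout);
}

fn main() {
    let mut candidates = HashMap::new();
    candidates.insert(1, EtcdInfo { latest_update: Instant::now() });
    cleanup_old_candidates(&mut candidates, Duration::from_secs(10));
    assert_eq!(candidates.len(), 1); // still fresh, so it is retained
}
```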
@@ -873,64 +843,6 @@ mod tests {
Ok(())
}
#[tokio::test]
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
let harness = RepoHarness::create("candidate_with_many_connection_failures")?;
let mut state = dummy_state(&harness);
let now = Utc::now().naive_utc();
let current_lsn = Lsn(100_000).align();
let bigger_lsn = Lsn(current_lsn.0 + 100).align();
state.wal_connection = None;
state.wal_stream_candidates = HashMap::from([
(
NodeId(0),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(bigger_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
(
NodeId(1),
EtcdSkTimeline {
timeline: SkTimelineInfo {
last_log_term: None,
flush_lsn: None,
commit_lsn: Some(current_lsn),
backup_lsn: None,
remote_consistent_lsn: None,
peer_horizon_lsn: None,
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
},
etcd_version: 0,
latest_update: now,
},
),
]);
state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]);
let candidate_with_less_errors = state
.next_connection_candidate()
.expect("Expected one candidate selected, but got none");
assert_eq!(
candidate_with_less_errors.safekeeper_id,
NodeId(1),
"Should select the node with less connection errors"
);
Ok(())
}
#[tokio::test]
async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;

View File

@@ -623,7 +623,6 @@ impl PostgresRedoProcess {
.env_clear()
.env("LD_LIBRARY_PATH", conf.pg_lib_dir())
.env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
.close_fds()
.output()
.map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?;

poetry.lock (generated), 52 changed lines
View File

@@ -544,21 +544,20 @@ test = ["pytest (>=6.2.0)", "pytest-cov", "pytest-subtests", "pytest-xdist", "pr
[[package]]
name = "docker"
version = "4.2.2"
version = "5.0.3"
description = "A Python library for the Docker Engine API."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
python-versions = ">=3.6"
[package.dependencies]
pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""}
pywin32 = {version = "227", markers = "sys_platform == \"win32\""}
requests = ">=2.14.2,<2.18.0 || >2.18.0"
six = ">=1.4.0"
websocket-client = ">=0.32.0"
[package.extras]
ssh = ["paramiko (>=2.4.2)"]
tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=1.3.4)", "idna (>=2.0.0)"]
tls = ["pyOpenSSL (>=17.5.0)", "cryptography (>=3.4.7)", "idna (>=2.0.0)"]
[[package]]
name = "ecdsa"
@@ -1004,17 +1003,6 @@ python-versions = ">=3.6"
[package.extras]
diagrams = ["jinja2", "railroad-diagrams"]
[[package]]
name = "pypiwin32"
version = "223"
description = ""
category = "main"
optional = false
python-versions = "*"
[package.dependencies]
pywin32 = ">=223"
[[package]]
name = "pyrsistent"
version = "0.18.1"
@@ -1136,7 +1124,7 @@ python-versions = "*"
[[package]]
name = "pywin32"
version = "301"
version = "227"
description = "Python for Window Extensions"
category = "main"
optional = false
@@ -1513,8 +1501,8 @@ cryptography = [
{file = "cryptography-36.0.1.tar.gz", hash = "sha256:53e5c1dc3d7a953de055d77bef2ff607ceef7a2aac0353b5d630ab67f7423638"},
]
docker = [
{file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"},
{file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"},
{file = "docker-5.0.3-py2.py3-none-any.whl", hash = "sha256:7a79bb439e3df59d0a72621775d600bc8bc8b422d285824cb37103eab91d1ce0"},
{file = "docker-5.0.3.tar.gz", hash = "sha256:d916a26b62970e7c2f554110ed6af04c7ccff8e9f81ad17d0d40c75637e227fb"},
]
ecdsa = [
{file = "ecdsa-0.17.0-py2.py3-none-any.whl", hash = "sha256:5cf31d5b33743abe0dfc28999036c849a69d548f994b535e527ee3cb7f3ef676"},
@@ -1814,10 +1802,6 @@ pyparsing = [
{file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"},
{file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"},
]
pypiwin32 = [
{file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"},
{file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"},
]
pyrsistent = [
{file = "pyrsistent-0.18.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:df46c854f490f81210870e509818b729db4488e1f30f2a1ce1698b2295a878d1"},
{file = "pyrsistent-0.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d45866ececf4a5fff8742c25722da6d4c9e180daa7b405dc0a2a2790d668c26"},
@@ -1874,16 +1858,18 @@ pytz = [
{file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
]
pywin32 = [
{file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"},
{file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"},
{file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"},
{file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"},
{file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"},
{file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"},
{file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"},
{file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"},
{file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"},
{file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"},
{file = "pywin32-227-cp27-cp27m-win32.whl", hash = "sha256:371fcc39416d736401f0274dd64c2302728c9e034808e37381b5e1b22be4a6b0"},
{file = "pywin32-227-cp27-cp27m-win_amd64.whl", hash = "sha256:4cdad3e84191194ea6d0dd1b1b9bdda574ff563177d2adf2b4efec2a244fa116"},
{file = "pywin32-227-cp35-cp35m-win32.whl", hash = "sha256:f4c5be1a293bae0076d93c88f37ee8da68136744588bc5e2be2f299a34ceb7aa"},
{file = "pywin32-227-cp35-cp35m-win_amd64.whl", hash = "sha256:a929a4af626e530383a579431b70e512e736e9588106715215bf685a3ea508d4"},
{file = "pywin32-227-cp36-cp36m-win32.whl", hash = "sha256:300a2db938e98c3e7e2093e4491439e62287d0d493fe07cce110db070b54c0be"},
{file = "pywin32-227-cp36-cp36m-win_amd64.whl", hash = "sha256:9b31e009564fb95db160f154e2aa195ed66bcc4c058ed72850d047141b36f3a2"},
{file = "pywin32-227-cp37-cp37m-win32.whl", hash = "sha256:47a3c7551376a865dd8d095a98deba954a98f326c6fe3c72d8726ca6e6b15507"},
{file = "pywin32-227-cp37-cp37m-win_amd64.whl", hash = "sha256:31f88a89139cb2adc40f8f0e65ee56a8c585f629974f9e07622ba80199057511"},
{file = "pywin32-227-cp38-cp38-win32.whl", hash = "sha256:7f18199fbf29ca99dff10e1f09451582ae9e372a892ff03a28528a24d55875bc"},
{file = "pywin32-227-cp38-cp38-win_amd64.whl", hash = "sha256:7c1ae32c489dc012930787f06244426f8356e129184a02c25aef163917ce158e"},
{file = "pywin32-227-cp39-cp39-win32.whl", hash = "sha256:c054c52ba46e7eb6b7d7dfae4dbd987a1bb48ee86debe3f245a2884ece46e295"},
{file = "pywin32-227-cp39-cp39-win_amd64.whl", hash = "sha256:f27cec5e7f588c3d1051651830ecc00294f90728d19c3bf6916e6dba93ea357c"},
]
pyyaml = [
{file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},

View File

@@ -115,7 +115,7 @@ mod tests {
Ok(())
});
waiter.await?;
let () = waiter.await?;
notifier.await?
}
}

View File

@@ -83,9 +83,7 @@ impl ElectionLeader {
) -> Result<bool> {
let resp = self.client.leader(election_name).await?;
let kv = resp
.kv()
.ok_or_else(|| anyhow!("failed to get leader response"))?;
let kv = resp.kv().ok_or(anyhow!("failed to get leader response"))?;
let leader = kv.value_str()?;
Ok(leader == candidate_name)

View File

@@ -637,17 +637,6 @@ where
&mut self,
msg: &VoteRequest,
) -> Result<Option<AcceptorProposerMessage>> {
// Once voted, we won't accept data from older proposers; flush
// everything we've already received so that new proposer starts
// streaming at end of our WAL, without overlap. Currently we truncate
// WAL at streaming point, so this avoids truncating already committed
// WAL.
//
// TODO: it would be smoother to not truncate committed piece at
// handle_elected instead. Currently not a big deal, as proposer is the
// only source of WAL; with peer2peer recovery it would be more
// important.
self.wal_store.flush_wal()?;
// initialize with refusal
let mut resp = VoteResponse {
term: self.state.acceptor_state.term,

View File

@@ -2,16 +2,18 @@ use anyhow::{Context, Result};
use etcd_broker::subscription_key::{
NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind,
};
use tokio::io::AsyncRead;
use tokio::task::JoinHandle;
use std::cmp::min;
use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI};
use postgres_ffi::xlog_utils::{
XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, MAX_SEND_SIZE, PG_TLI,
};
use remote_storage::{GenericRemoteStorage, RemoteStorage};
use tokio::fs::File;
use tokio::runtime::Builder;
@@ -450,41 +452,45 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
pub async fn read_object(
file_path: PathBuf,
offset: u64,
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
let download = match REMOTE_STORAGE
.get()
.context("Failed to get remote storage")?
.as_ref()
.context("No remote storage configured")?
{
GenericRemoteStorage::Local(local_storage) => {
let source = local_storage.remote_object_id(&file_path)?;
) -> (impl AsyncRead, JoinHandle<Result<()>>) {
let storage = REMOTE_STORAGE.get().expect("failed to get remote storage");
info!(
"local download about to start from {} at offset {}",
source.display(),
offset
);
local_storage
.download_byte_range(&source, offset, None)
.await
let (mut pipe_writer, pipe_reader) = tokio::io::duplex(MAX_SEND_SIZE);
let copy_result = tokio::spawn(async move {
let res = match storage.as_ref().unwrap() {
GenericRemoteStorage::Local(local_storage) => {
let source = local_storage.remote_object_id(&file_path)?;
info!(
"local download about to start from {} at offset {}",
source.display(),
offset
);
local_storage
.download_byte_range(&source, offset, None, &mut pipe_writer)
.await
}
GenericRemoteStorage::S3(s3_storage) => {
let s3key = s3_storage.remote_object_id(&file_path)?;
info!(
"S3 download about to start from {:?} at offset {}",
s3key, offset
);
s3_storage
.download_byte_range(&s3key, offset, None, &mut pipe_writer)
.await
}
};
if let Err(e) = res {
error!("failed to download WAL segment from remote storage: {}", e);
Err(e)
} else {
Ok(())
}
GenericRemoteStorage::S3(s3_storage) => {
let s3key = s3_storage.remote_object_id(&file_path)?;
});
info!(
"S3 download about to start from {:?} at offset {}",
s3key, offset
);
s3_storage.download_byte_range(&s3key, offset, None).await
}
}
.with_context(|| {
format!(
"Failed to open WAL segment download stream for local storage path {}",
file_path.display()
)
})?;
Ok(download.download_stream)
(pipe_reader, copy_result)
}
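
The reshaped `read_object` follows a duplex-pipe pattern: return a reader to the caller right away and let a spawned task stream the remote bytes into the other half. A self-contained sketch of that pattern, where the `start_download` helper and the in-memory payload are stand-ins for the real remote-storage byte-range download:

```rust
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};
use tokio::task::JoinHandle;

/// Illustrative stand-in for the remote-storage download: hand the caller a
/// reader immediately and let a background task fill the writer half.
fn start_download() -> (impl AsyncRead + Unpin, JoinHandle<std::io::Result<()>>) {
    let (mut pipe_writer, pipe_reader) = tokio::io::duplex(64 * 1024);
    let handle: JoinHandle<std::io::Result<()>> = tokio::spawn(async move {
        // The real code streams a byte range from local or S3 storage here.
        pipe_writer.write_all(b"wal segment bytes").await?;
        Ok(())
    });
    (pipe_reader, handle)
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let (mut reader, download_task) = start_download();
    let mut buf = Vec::new();
    reader.read_to_end(&mut buf).await?;
    download_task.await??; // surfaces any download error after the stream ends
    println!("read {} bytes", buf.len());
    Ok(())
}
```

On the consumer side this matches the `WalReader` hunk further down: hold on to the reader for streaming, and consult the returned `JoinHandle` only if the download's outcome matters (the hunk shown discards it).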

View File

@@ -604,7 +604,8 @@ impl WalReader {
// Try to open remote file, if remote reads are enabled
if self.enable_remote_read {
return read_object(wal_file_path, xlogoff as u64).await;
let (reader, _) = read_object(wal_file_path, xlogoff as u64).await;
return Ok(Box::pin(reader));
}
bail!("WAL segment is not found")

View File

@@ -28,10 +28,6 @@ strict = true
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true
[mypy-pg8000.*]
# Used only in testing clients
ignore_missing_imports = true
[mypy-cached_property.*]
ignore_missing_imports = true

View File

@@ -1,3 +1,6 @@
from contextlib import closing
import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
@@ -105,3 +108,16 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):
branch2_cur.execute('SELECT count(*) FROM foo')
assert branch2_cur.fetchone() == (300000, )
def test_ancestor_branch_detach(neon_simple_env: NeonEnv):
env = neon_simple_env
parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_detach_parent", "empty")
env.neon_cli.create_branch("test_ancestor_branch_detach_branch1",
"test_ancestor_branch_detach_parent")
ps_http = env.pageserver.http_client()
with pytest.raises(NeonPageserverApiException, match="Failed to detach inmem tenant timeline"):
ps_http.timeline_detach(env.initial_tenant, parent_timeline_id)

View File

@@ -1,6 +1,8 @@
from contextlib import closing
from uuid import uuid4
from typing import Iterator
from uuid import UUID, uuid4
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
from requests.exceptions import HTTPError
import pytest

View File

@@ -1,9 +1,11 @@
from contextlib import closing, contextmanager
import psycopg2.extras
import pytest
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder
from fixtures.log_helper import log
import os
import time
import asyncpg
from fixtures.neon_fixtures import Postgres
import threading

View File

@@ -1,6 +1,8 @@
import pytest
from contextlib import closing
from fixtures.neon_fixtures import NeonEnv
from fixtures.log_helper import log
#

View File

@@ -1,167 +0,0 @@
import threading
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import lsn_from_hex
# Test the GC implementation when running with branching.
# This test reproduces the issue https://github.com/neondatabase/neon/issues/707.
#
# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows:
# ...
# p -> has an image layer xx_p with p < lsn1
# ...
# lsn1
# ...
# q -> has an image layer yy_q with lsn1 < q < lsn2
# ...
# lsn2
#
# Consider running a GC iteration such that the GC horizon is between p and lsn1
# ...
# p -> has an image layer xx_p with p < lsn1
# D_start -> is a delta layer D's start (e.g D = '...-...-D_start-D_end')
# ...
# GC_h -> is a gc horizon such that p < GC_h < lsn1
# ...
# lsn1
# ...
# D_end -> is a delta layer D's end
# ...
# q -> has an image layer yy_q with lsn1 < q < lsn2
# ...
# lsn2
#
# As described in the issue #707, the image layer xx_p will be deleted as
# its range is below the GC horizon and there exists a newer image layer yy_q (q > p).
# However, removing xx_p will corrupt any delta layers that depend on xx_p that
# are not deleted by GC. For example, the delta layer D is corrupted in the
# above example because D depends on the image layer xx_p for value reconstruction.
#
# Because the delta layer D covering lsn1 is corrupted, creating a branch
# starting from lsn1 should return an error as follows:
# could not find data for key ... at LSN ..., for request at LSN ...
def test_branch_and_gc(neon_simple_env: NeonEnv):
env = neon_simple_env
tenant, _ = env.neon_cli.create_tenant(
conf={
# disable background GC
'gc_period': '10 m',
'gc_horizon': f'{10 * 1024 ** 3}',
# small checkpoint distance to create more delta layer files
'checkpoint_distance': f'{1024 ** 2}',
# set the target size to be large to allow the image layer to cover the whole key space
'compaction_target_size': f'{1024 ** 3}',
# tweak the default settings to allow quickly create image layers and L1 layers
'compaction_period': '1 s',
'compaction_threshold': '2',
'image_creation_threshold': '1',
# set PITR interval to be small, so we can do GC
'pitr_interval': '1 s'
})
timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant)
pg_main = env.postgres.create_start('test_main', tenant_id=tenant)
main_cur = pg_main.connect().cursor()
main_cur.execute(
"CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
)
main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn1 = main_cur.fetchone()[0]
log.info(f'LSN1: {lsn1}')
main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
lsn2 = main_cur.fetchone()[0]
log.info(f'LSN2: {lsn2}')
# Set the GC horizon so that lsn1 is inside the horizon, which means
# we can create a new branch starting from lsn1.
env.pageserver.safe_psql(
f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}'''
)
env.neon_cli.create_branch('test_branch',
'test_main',
tenant_id=tenant,
ancestor_start_lsn=lsn1)
pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant)
branch_cur = pg_branch.connect().cursor()
branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
branch_cur.execute('SELECT count(*) FROM foo')
assert branch_cur.fetchone() == (200000, )
# This test simulates a race condition happening when branch creation and GC are performed concurrently.
#
# Suppose we want to create a new timeline 't' from a source timeline 's' starting
# from a lsn 'lsn'. Upon creating 't', if we don't hold the GC lock and compare 'lsn' with
# the latest GC information carefully, it's possible for GC to accidentally remove data
# needed by the new timeline.
#
# In this test, GC is requested before the branch creation but is delayed to happen after branch creation.
# As a result, when doing GC for the source timeline, we don't have any information about
# the upcoming new branches, so it's possible to remove data that may be needed by the new branches.
# It's the branch creation task's job to make sure the starting 'lsn' is not out of scope
# and prevent creating branches with invalid starting LSNs.
#
# For more details, see discussion in https://github.com/neondatabase/neon/pull/2101#issuecomment-1185273447.
def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
env = neon_simple_env
# Disable background GC but set the `pitr_interval` to be small, so GC can delete something
tenant, _ = env.neon_cli.create_tenant(
conf={
# disable background GC
'gc_period': '10 m',
'gc_horizon': f'{10 * 1024 ** 3}',
# small checkpoint distance to create more delta layer files
'checkpoint_distance': f'{1024 ** 2}',
# set the target size to be large to allow the image layer to cover the whole key space
'compaction_target_size': f'{1024 ** 3}',
# tweak the default settings to allow quickly create image layers and L1 layers
'compaction_period': '1 s',
'compaction_threshold': '2',
'image_creation_threshold': '1',
# set PITR interval to be small, so we can do GC
'pitr_interval': '1 s'
})
b0 = env.neon_cli.create_branch('b0', tenant_id=tenant)
pg0 = env.postgres.create_start('b0', tenant_id=tenant)
res = pg0.safe_psql_many(queries=[
"CREATE TABLE t(key serial primary key)",
"INSERT INTO t SELECT FROM generate_series(1, 100000)",
"SELECT pg_current_wal_insert_lsn()",
"INSERT INTO t SELECT FROM generate_series(1, 100000)",
])
lsn = res[2][0][0]
# Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the
# branch creation task but the individual timeline GC iteration happens *after*
# the branch creation task.
env.pageserver.safe_psql(f"failpoints before-timeline-gc=sleep(2000)")
def do_gc():
env.pageserver.safe_psql(f"do_gc {tenant.hex} {b0.hex} 0")
thread = threading.Thread(target=do_gc, daemon=True)
thread.start()
# The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC.
with pytest.raises(Exception, match="invalid branch start lsn"):
env.neon_cli.create_branch('b1', 'b0', tenant_id=tenant, ancestor_start_lsn=lsn)

View File

@@ -1,3 +1,4 @@
import subprocess
from contextlib import closing
import psycopg2.extras

View File

@@ -1,89 +0,0 @@
from typing import List
import threading
import pytest
from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres
import time
import random
from fixtures.log_helper import log
from performance.test_perf_pgbench import get_scales_matrix
# Test branch creation
#
# This test spawns pgbench in a thread in the background, and creates a branch while
# pgbench is running. Then it launches pgbench on the new branch, and creates another branch.
# Repeat `n_branches` times.
#
# If 'ty' == 'cascade', each branch is created from the previous branch, so that you end
# up with a branch of a branch of a branch ... of a branch. With 'ty' == 'flat',
# each branch is created from the root.
@pytest.mark.parametrize("n_branches", [10])
@pytest.mark.parametrize("scale", get_scales_matrix(1))
@pytest.mark.parametrize("ty", ["cascade", "flat"])
def test_branching_with_pgbench(neon_simple_env: NeonEnv,
pg_bin: PgBin,
n_branches: int,
scale: int,
ty: str):
env = neon_simple_env
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
tenant, _ = env.neon_cli.create_tenant(
conf={
'gc_period': '5 s',
'gc_horizon': f'{1024 ** 2}',
'checkpoint_distance': f'{1024 ** 2}',
'compaction_target_size': f'{1024 ** 2}',
# set PITR interval to be small, so we can do GC
'pitr_interval': '5 s'
})
def run_pgbench(pg: Postgres):
connstr = pg.connstr()
log.info(f"Start a pgbench workload on pg {connstr}")
pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr])
pg_bin.run_capture(['pgbench', '-T15', connstr])
env.neon_cli.create_branch('b0', tenant_id=tenant)
pgs: List[Postgres] = []
pgs.append(env.postgres.create_start('b0', tenant_id=tenant))
threads: List[threading.Thread] = []
threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True))
threads[-1].start()
thread_limit = 4
for i in range(n_branches):
# random a delay between [0, 5]
delay = random.random() * 5
time.sleep(delay)
log.info(f"Sleep {delay}s")
# If the number of concurrent threads exceeds a threshold,
# wait for all the threads to finish before spawning a new one.
# Because tests defined in `batch_others` are run concurrently in CI,
# we want to avoid the situation that one test exhausts resources for other tests.
if len(threads) >= thread_limit:
for thread in threads:
thread.join()
threads = []
if ty == "cascade":
env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant)
else:
env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant)
pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant))
threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True))
threads[-1].start()
for thread in threads:
thread.join()
for pg in pgs:
res = pg.safe_psql('SELECT count(*) from pgbench_accounts')
assert res[0] == (100000 * scale, )

Some files were not shown because too many files have changed in this diff.