mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-18 05:30:37 +00:00
Compare commits
5 Commits
bojan-repl
...
sk-migrate
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
43acca1b12 | ||
|
|
191c5e5641 | ||
|
|
89755c045f | ||
|
|
f9c00e53de | ||
|
|
b1bfc23318 |
2
.circleci/ansible/.gitignore
vendored
2
.circleci/ansible/.gitignore
vendored
@@ -1,4 +1,2 @@
|
||||
zenith_install.tar.gz
|
||||
.zenith_current_version
|
||||
neon_install.tar.gz
|
||||
.neon_current_version
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
- name: Upload Neon binaries
|
||||
- name: Upload Zenith binaries
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
|
||||
tasks:
|
||||
|
||||
- name: get latest version of Neon binaries
|
||||
- name: get latest version of Zenith binaries
|
||||
register: current_version_file
|
||||
set_fact:
|
||||
current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
|
||||
current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
@@ -19,11 +19,11 @@
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: upload and extract Neon binaries to /usr/local
|
||||
- name: upload and extract Zenith binaries to /usr/local
|
||||
ansible.builtin.unarchive:
|
||||
owner: root
|
||||
group: root
|
||||
src: neon_install.tar.gz
|
||||
src: zenith_install.tar.gz
|
||||
dest: /usr/local
|
||||
become: true
|
||||
tags:
|
||||
@@ -63,18 +63,21 @@
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: update remote storage (s3) config
|
||||
lineinfile:
|
||||
path: /storage/pageserver/data/pageserver.toml
|
||||
line: "{{ item }}"
|
||||
loop:
|
||||
- "[remote_storage]"
|
||||
- "bucket_name = '{{ bucket_name }}'"
|
||||
- "bucket_region = '{{ bucket_region }}'"
|
||||
- "prefix_in_bucket = '{{ inventory_hostname }}'"
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
# It seems that currently S3 integration does not play well
|
||||
# even with fresh pageserver without a burden of old data.
|
||||
# TODO: turn this back on once the issue is solved.
|
||||
# - name: update remote storage (s3) config
|
||||
# lineinfile:
|
||||
# path: /storage/pageserver/data/pageserver.toml
|
||||
# line: "{{ item }}"
|
||||
# loop:
|
||||
# - "[remote_storage]"
|
||||
# - "bucket_name = '{{ bucket_name }}'"
|
||||
# - "bucket_region = '{{ bucket_region }}'"
|
||||
# - "prefix_in_bucket = '{{ inventory_hostname }}'"
|
||||
# become: true
|
||||
# tags:
|
||||
# - pageserver
|
||||
|
||||
- name: upload systemd service definition
|
||||
ansible.builtin.template:
|
||||
|
||||
@@ -4,10 +4,10 @@ set -e
|
||||
|
||||
RELEASE=${RELEASE:-false}
|
||||
|
||||
# look at docker hub for latest tag for neon docker image
|
||||
# look at docker hub for latest tag fo zenith docker image
|
||||
if [ "${RELEASE}" = "true" ]; then
|
||||
echo "search latest relase tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
|
||||
fi
|
||||
else
|
||||
echo "search latest dev tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1)
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
@@ -28,25 +28,25 @@ fi
|
||||
echo "found ${VERSION}"
|
||||
|
||||
# do initial cleanup
|
||||
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
|
||||
mkdir neon_install
|
||||
rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version
|
||||
mkdir zenith_install
|
||||
|
||||
# retrive binaries from docker image
|
||||
echo "getting binaries from docker image"
|
||||
docker pull --quiet neondatabase/neon:${TAG}
|
||||
ID=$(docker create neondatabase/neon:${TAG})
|
||||
docker pull --quiet zenithdb/zenith:${TAG}
|
||||
ID=$(docker create zenithdb/zenith:${TAG})
|
||||
docker cp ${ID}:/data/postgres_install.tar.gz .
|
||||
tar -xzf postgres_install.tar.gz -C neon_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/
|
||||
tar -xzf postgres_install.tar.gz -C zenith_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/
|
||||
docker rm -vf ${ID}
|
||||
|
||||
# store version to file (for ansible playbooks) and create binaries tarball
|
||||
echo ${VERSION} > neon_install/.neon_current_version
|
||||
echo ${VERSION} > .neon_current_version
|
||||
tar -czf neon_install.tar.gz -C neon_install .
|
||||
echo ${VERSION} > zenith_install/.zenith_current_version
|
||||
echo ${VERSION} > .zenith_current_version
|
||||
tar -czf zenith_install.tar.gz -C zenith_install .
|
||||
|
||||
# do final cleaup
|
||||
rm -rf neon_install postgres_install.tar.gz
|
||||
rm -rf zenith_install postgres_install.tar.gz
|
||||
|
||||
@@ -14,4 +14,3 @@ safekeepers
|
||||
console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = etcd-release.local:2379
|
||||
|
||||
@@ -5,6 +5,7 @@ zenith-us-stage-ps-2 console_region_id=27
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-1 console_region_id=27
|
||||
zenith-us-stage-sk-2 console_region_id=27
|
||||
zenith-us-stage-sk-3 console_region_id=27
|
||||
zenith-us-stage-sk-4 console_region_id=27
|
||||
|
||||
[storage:children]
|
||||
@@ -15,4 +16,3 @@ safekeepers
|
||||
console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = etcd-staging.local:2379
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
version: 2.1
|
||||
|
||||
executors:
|
||||
neon-xlarge-executor:
|
||||
zenith-xlarge-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
|
||||
- image: zimg/rust:1.58
|
||||
neon-executor:
|
||||
- image: zimg/rust:1.56
|
||||
zenith-executor:
|
||||
docker:
|
||||
- image: zimg/rust:1.58
|
||||
- image: zimg/rust:1.56
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
@@ -22,7 +22,7 @@ jobs:
|
||||
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -67,9 +67,9 @@ jobs:
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
# A job to build Neon rust code
|
||||
build-neon:
|
||||
executor: neon-xlarge-executor
|
||||
# A job to build zenith rust code
|
||||
build-zenith:
|
||||
executor: zenith-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -113,7 +113,7 @@ jobs:
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS="--release --features profiling"
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
export RUSTC_WRAPPER=cachepot
|
||||
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
||||
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
cachepot -s
|
||||
|
||||
- save_cache:
|
||||
@@ -132,6 +132,20 @@ jobs:
|
||||
- ~/.cargo/git
|
||||
- target
|
||||
|
||||
# Run style checks
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: cargo clippy
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run:
|
||||
name: cargo test
|
||||
@@ -209,7 +223,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
check-codestyle-python:
|
||||
executor: neon-executor
|
||||
executor: zenith-executor
|
||||
steps:
|
||||
- checkout
|
||||
- restore_cache:
|
||||
@@ -232,7 +246,7 @@ jobs:
|
||||
command: poetry run mypy .
|
||||
|
||||
run-pytest:
|
||||
executor: neon-executor
|
||||
executor: zenith-executor
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
@@ -355,7 +369,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -376,7 +390,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -391,7 +405,7 @@ jobs:
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
@@ -402,11 +416,11 @@ jobs:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
@@ -423,7 +437,7 @@ jobs:
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build neondatabase/neon:latest image and push it to Docker hub
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -437,18 +451,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:latest
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:latest
|
||||
|
||||
# Build neondatabase/compute-node:latest image and push it to Docker hub
|
||||
# Build zenithdb/compute-node:latest image and push it to Docker hub
|
||||
docker-image-compute:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -456,31 +470,31 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build neondatabase/compute-tools:latest image and push it to Docker hub
|
||||
# Build zenithdb/compute-tools:latest image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push neondatabase/compute-tools:latest
|
||||
--tag zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:latest
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:latest
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:latest
|
||||
|
||||
# Build production neondatabase/neon:release image and push it to Docker hub
|
||||
# Build production zenithdb/zenith:release image and push it to Docker hub
|
||||
docker-image-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -494,18 +508,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:release
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:release
|
||||
|
||||
# Build production neondatabase/compute-node:release image and push it to Docker hub
|
||||
# Build production zenithdb/compute-node:release image and push it to Docker hub
|
||||
docker-image-compute-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -513,29 +527,29 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build neondatabase/compute-tools:release image and push it to Docker hub
|
||||
# Build zenithdb/compute-tools:release image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push neondatabase/compute-tools:release
|
||||
--tag zenithdb/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:release
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:release
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:release
|
||||
|
||||
deploy-staging:
|
||||
docker:
|
||||
@@ -561,7 +575,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i staging.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-staging-proxy:
|
||||
docker:
|
||||
@@ -579,13 +593,13 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
helm repo add zenithdb https://zenithdb.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade zenith-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
|
||||
deploy-release:
|
||||
docker:
|
||||
@@ -611,7 +625,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i production.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-release-proxy:
|
||||
docker:
|
||||
@@ -629,7 +643,7 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add zenithdb https://neondatabase.github.io/helm-charts
|
||||
helm repo add zenithdb https://zenithdb.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
@@ -658,7 +672,7 @@ jobs:
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"context\": \"zenith-remote-ci\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
- run:
|
||||
@@ -674,7 +688,7 @@ jobs:
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"ci_job_name\": \"zenith-remote-ci\",
|
||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
||||
\"remote_repo\": \"$LOCAL_REPO\"
|
||||
}
|
||||
@@ -690,8 +704,8 @@ workflows:
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-neon:
|
||||
name: build-neon-<< matrix.build_type >>
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
@@ -706,7 +720,7 @@ workflows:
|
||||
test_selection: batch_pg_regress
|
||||
needs_postgres_source: true
|
||||
requires:
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: other-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -714,7 +728,7 @@ workflows:
|
||||
build_type: ["debug", "release"]
|
||||
test_selection: batch_others
|
||||
requires:
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: benchmarks
|
||||
context: PERF_TEST_RESULT_CONNSTR
|
||||
@@ -723,7 +737,7 @@ workflows:
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-neon-release
|
||||
- build-zenith-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
@@ -814,11 +828,11 @@ workflows:
|
||||
- remote-ci-trigger:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
remote_repo: "neondatabase/cloud"
|
||||
remote_repo: "zenithdb/console"
|
||||
requires:
|
||||
# XXX: Successful build doesn't mean everything is OK, but
|
||||
# the job to be triggered takes so much time to complete (~22 min)
|
||||
# that it's better not to wait for the commented-out steps
|
||||
- build-neon-release
|
||||
- build-zenith-debug
|
||||
# - pg_regress-tests-release
|
||||
# - other-tests-release
|
||||
|
||||
@@ -1,12 +1,9 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.neon.tech/psql_session/"
|
||||
authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.zenith.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -28,7 +25,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local:9095/management/api/v2"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: staging
|
||||
zenith_region: us-east-1
|
||||
zenith_region_slug: virginia
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: *.cloud.stage.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,12 +1,9 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.zenith.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -20,7 +17,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
@@ -10,8 +10,6 @@ dep-format-version = "2"
|
||||
# Hakari works much better with the new feature resolver.
|
||||
# For more about the new feature resolver, see:
|
||||
# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver
|
||||
# Have to keep the resolver still here since hakari requires this field,
|
||||
# despite it's now the default for 2021 edition & cargo.
|
||||
resolver = "2"
|
||||
|
||||
# Add triples corresponding to platforms commonly used by developers here.
|
||||
|
||||
13
.github/workflows/benchmarking.yml
vendored
13
.github/workflows/benchmarking.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
|
||||
PG_BIN: "/usr/pgsql-13/bin"
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
echo Poetry
|
||||
poetry --version
|
||||
echo Pgbench
|
||||
$POSTGRES_DISTRIB_DIR/bin/pgbench --version
|
||||
$PG_BIN/pgbench --version
|
||||
|
||||
# FIXME cluster setup is skipped due to various changes in console API
|
||||
# for now pre created cluster is used. When API gain some stability
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
|
||||
echo "Starting cluster"
|
||||
# wake up the cluster
|
||||
$POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
$PG_BIN/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
@@ -83,11 +83,8 @@ jobs:
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
env:
|
||||
# The pgbench test runs two tests of given duration against each scale.
|
||||
# So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes.
|
||||
# Plus time needed to initialize the test databases.
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,15"
|
||||
PLATFORM: "zenith-staging"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
|
||||
17
.github/workflows/testing.yml
vendored
17
.github/workflows/testing.yml
vendored
@@ -36,7 +36,8 @@ jobs:
|
||||
|
||||
- name: Install macOs postgres dependencies
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison
|
||||
run: |
|
||||
brew install flex bison
|
||||
|
||||
- name: Set pg revision for caching
|
||||
id: pg_ver
|
||||
@@ -52,7 +53,8 @@ jobs:
|
||||
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: make postgres
|
||||
run: |
|
||||
make postgres
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
@@ -62,10 +64,13 @@ jobs:
|
||||
~/.cargo/registry
|
||||
~/.cargo/git
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
# Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo build --workspace --bins --examples --tests
|
||||
|
||||
- name: Run cargo test
|
||||
run: cargo test --all --all-targets
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,6 +11,3 @@ test_output/
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
|
||||
|
||||
*.key
|
||||
*.crt
|
||||
|
||||
904
Cargo.lock
generated
904
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
13
Cargo.toml
13
Cargo.toml
@@ -3,19 +3,22 @@ members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"walkeeper",
|
||||
"workspace_hack",
|
||||
"zenith",
|
||||
"libs/*",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
[profile.release]
|
||||
# This is useful for profiling and, to some extent, debug.
|
||||
# Besides, debug info should not affect the performance.
|
||||
debug = true
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
# This is only needed for proxy's tests
|
||||
# TODO: we should probably fork tokio-postgres-rustls instead
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
|
||||
15
Dockerfile
15
Dockerfile
@@ -1,5 +1,7 @@
|
||||
# Build Postgres
|
||||
FROM zimg/rust:1.58 AS pg-build
|
||||
#
|
||||
#FROM zimg/rust:1.56 AS pg-build
|
||||
FROM zenithdb/build:buster-20220309 AS pg-build
|
||||
WORKDIR /pg
|
||||
|
||||
USER root
|
||||
@@ -9,26 +11,27 @@ COPY Makefile Makefile
|
||||
|
||||
ENV BUILD_TYPE release
|
||||
RUN set -e \
|
||||
&& mold -run make -j $(nproc) -s postgres \
|
||||
&& make -j $(nproc) -s postgres \
|
||||
&& rm -rf tmp_install/build \
|
||||
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
||||
|
||||
# Build zenith binaries
|
||||
FROM zimg/rust:1.58 AS build
|
||||
#
|
||||
#FROM zimg/rust:1.56 AS build
|
||||
FROM zenithdb/build:buster-20220309 AS build
|
||||
ARG GIT_VERSION=local
|
||||
|
||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
ARG AWS_ACCESS_KEY_ID
|
||||
ARG AWS_SECRET_ACCESS_KEY
|
||||
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
|
||||
|
||||
COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY . .
|
||||
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats.
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build --release \
|
||||
&& cachepot -s
|
||||
RUN cargo build --release && /usr/local/cargo/bin/cachepot -s
|
||||
|
||||
# Build final image
|
||||
#
|
||||
|
||||
23
Dockerfile.build
Normal file
23
Dockerfile.build
Normal file
@@ -0,0 +1,23 @@
|
||||
FROM rust:1.56.1-slim-buster
|
||||
WORKDIR /home/circleci/project
|
||||
|
||||
RUN set -e \
|
||||
&& apt-get update \
|
||||
&& apt-get -yq install \
|
||||
automake \
|
||||
libtool \
|
||||
build-essential \
|
||||
bison \
|
||||
flex \
|
||||
libreadline-dev \
|
||||
zlib1g-dev \
|
||||
libxml2-dev \
|
||||
libseccomp-dev \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
clang
|
||||
|
||||
RUN set -e \
|
||||
&& rustup component add clippy \
|
||||
&& cargo install cargo-audit \
|
||||
&& cargo install --git https://github.com/paritytech/cachepot
|
||||
@@ -1,18 +1,19 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .circle/config.yml
|
||||
FROM zimg/rust:1.58 AS rust-build
|
||||
FROM zenithdb/build:buster-20220309 AS rust-build
|
||||
|
||||
WORKDIR /zenith
|
||||
|
||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
ARG AWS_ACCESS_KEY_ID
|
||||
ARG AWS_SECRET_ACCESS_KEY
|
||||
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \
|
||||
&& cachepot -s
|
||||
RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
FROM debian:buster-slim
|
||||
|
||||
COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl
|
||||
COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
31
README.md
31
README.md
@@ -1,22 +1,19 @@
|
||||
# Neon
|
||||
# Zenith
|
||||
|
||||
Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
The project used to be called "Zenith". Many of the commands and code comments
|
||||
still refer to "zenith", but we are in the process of renaming things.
|
||||
Zenith is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
## Architecture overview
|
||||
|
||||
A Neon installation consists of compute nodes and Neon storage engine.
|
||||
A Zenith installation consists of compute nodes and Zenith storage engine.
|
||||
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by Zenith storage engine.
|
||||
|
||||
Neon storage engine consists of two major components:
|
||||
Zenith storage engine consists of two major components:
|
||||
- Pageserver. Scalable storage backend for compute nodes.
|
||||
- WAL service. The service that receives WAL from compute node and ensures that it is stored durably.
|
||||
|
||||
Pageserver consists of:
|
||||
- Repository - Neon storage implementation.
|
||||
- Repository - Zenith storage implementation.
|
||||
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
|
||||
- Page service - service that communicates with compute nodes and responds with pages from the repository.
|
||||
- WAL redo - service that builds pages from base images and WAL records on Page service request.
|
||||
@@ -31,17 +28,17 @@ apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libsec
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
```
|
||||
|
||||
[Rust] 1.58 or later is also required.
|
||||
[Rust] 1.56.1 or later is also required.
|
||||
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
|
||||
2. Build neon and patched postgres
|
||||
2. Build zenith and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
cd zenith
|
||||
make -j5
|
||||
```
|
||||
|
||||
@@ -129,7 +126,7 @@ INSERT 0 1
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
./scripts/pytest
|
||||
```
|
||||
@@ -144,14 +141,14 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d
|
||||
|
||||
### Postgres-specific terms
|
||||
|
||||
Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
|
||||
|
||||
To get more familiar with this aspect, refer to:
|
||||
|
||||
- [Neon glossary](/docs/glossary.md)
|
||||
- [Zenith glossary](/docs/glossary.md)
|
||||
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
|
||||
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
||||
- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres))
|
||||
|
||||
## Join the development
|
||||
|
||||
|
||||
@@ -11,11 +11,10 @@ clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
log = { version = "0.4", features = ["std", "serde"] }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
regex = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tar = "0.4"
|
||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -38,7 +38,6 @@ use clap::Arg;
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use compute_tools::checker::create_writablity_check_data;
|
||||
use compute_tools::config;
|
||||
use compute_tools::http_api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
@@ -129,8 +128,6 @@ fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
|
||||
|
||||
handle_roles(&read_state.spec, &mut client)?;
|
||||
handle_databases(&read_state.spec, &mut client)?;
|
||||
handle_grants(&read_state.spec, &mut client)?;
|
||||
create_writablity_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
@@ -158,7 +155,7 @@ fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `utils::logging` later
|
||||
// TODO: re-use `zenith_utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
|
||||
@@ -1,46 +0,0 @@
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use log::error;
|
||||
use postgres::Client;
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use crate::zenith::ComputeState;
|
||||
|
||||
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
||||
let query = "
|
||||
CREATE TABLE IF NOT EXISTS health_check (
|
||||
id serial primary key,
|
||||
updated_at timestamptz default now()
|
||||
);
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();";
|
||||
let result = client.simple_query(query)?;
|
||||
if result.len() < 2 {
|
||||
return Err(anyhow::format_err!("executed {} queries", result.len()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_writability(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
|
||||
let connstr = state.read().unwrap().connstr.clone();
|
||||
let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?;
|
||||
if client.is_closed() {
|
||||
return Err(anyhow!("connection to postgres closed"));
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let result = client
|
||||
.simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;")
|
||||
.await?;
|
||||
|
||||
if result.len() != 1 {
|
||||
return Err(anyhow!("statement can't be executed"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -11,7 +11,7 @@ use log::{error, info};
|
||||
use crate::zenith::*;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
|
||||
fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in the plain text.
|
||||
(&Method::GET, "/last_activity") => {
|
||||
@@ -29,15 +29,6 @@ async fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Respons
|
||||
Response::new(Body::from(format!("{}", state.ready)))
|
||||
}
|
||||
|
||||
(&Method::GET, "/check_writability") => {
|
||||
info!("serving /check_writability GET request");
|
||||
let res = crate::checker::check_writability(&state).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
@@ -57,7 +48,7 @@ async fn serve(state: Arc<RwLock<ComputeState>>) {
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move { Ok::<_, Infallible>(routes(req, state).await) }
|
||||
async move { Ok::<_, Infallible>(routes(req, state)) }
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
//! Various tools and helpers to handle cluster / compute node (Postgres)
|
||||
//! configuration.
|
||||
//!
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod http_api;
|
||||
#[macro_use]
|
||||
|
||||
@@ -132,14 +132,7 @@ impl Role {
|
||||
let mut params: String = "LOGIN".to_string();
|
||||
|
||||
if let Some(pass) = &self.encrypted_password {
|
||||
// Some time ago we supported only md5 and treated all encrypted_password as md5.
|
||||
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
||||
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
||||
if pass.starts_with("SCRAM-SHA-256") {
|
||||
params.push_str(&format!(" PASSWORD '{}'", pass));
|
||||
} else {
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
}
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
} else {
|
||||
params.push_str(" PASSWORD NULL");
|
||||
}
|
||||
|
||||
@@ -244,24 +244,3 @@ pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Grant CREATE ON DATABASE to the database owner
|
||||
// to allow clients create trusted extensions.
|
||||
pub fn handle_grants(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.quote(),
|
||||
db.owner.quote()
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
toml = "0.5"
|
||||
@@ -18,6 +18,6 @@ url = "2.2.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
safekeeper = { path = "../safekeeper" }
|
||||
utils = { path = "../libs/utils" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -11,12 +11,11 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
@@ -273,7 +272,12 @@ impl PostgresNode {
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
conf.append("wal_keep_size", "0");
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeeper or
|
||||
// page server yet. (gh issue #349)
|
||||
conf.append("wal_keep_size", "10TB");
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
@@ -327,14 +331,14 @@ impl PostgresNode {
|
||||
// Configure the node to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
|
||||
let safekeepers = self
|
||||
let wal_acceptors = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("wal_acceptors", &safekeepers);
|
||||
conf.append("wal_acceptors", &wal_acceptors);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
@@ -416,15 +420,10 @@ impl PostgresNode {
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
let pg_ctl = cmd.status().context("pg_ctl failed")?;
|
||||
|
||||
let pg_ctl = cmd.output().context("pg_ctl failed")?;
|
||||
if !pg_ctl.status.success() {
|
||||
anyhow::bail!(
|
||||
"pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
|
||||
pg_ctl.status,
|
||||
String::from_utf8_lossy(&pg_ctl.stdout),
|
||||
String::from_utf8_lossy(&pg_ctl.stderr),
|
||||
);
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,11 +11,9 @@ use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
|
||||
@@ -13,17 +13,15 @@ use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper::http::models::TimelineCreateRequest;
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
use walkeeper::http::models::TimelineCreateRequest;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
@@ -10,23 +9,21 @@ use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::timelines::TimelineInfo;
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use pageserver::tenant_mgr::TenantInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -345,36 +342,10 @@ impl PageServerNode {
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: Option<ZTenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<Option<ZTenantId>> {
|
||||
let tenant_id_string = self
|
||||
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
|
||||
.json(&TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.json(&TenantCreateRequest { new_tenant_id })
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<String>>()?;
|
||||
@@ -391,35 +362,6 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
|
||||
.json(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
|
||||
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
|
||||
- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
|
||||
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
|
||||
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
|
||||
- [walkeeper/README](/walkeeper/README) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori
|
||||
tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256")
|
||||
```
|
||||
|
||||
Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs
|
||||
Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs
|
||||
|
||||
@@ -29,7 +29,7 @@ Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;
|
||||
A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;
|
||||
|
||||
### Checkpoint (Layered repository)
|
||||
|
||||
@@ -108,10 +108,10 @@ PostgreSQL LSNs and functions to monitor them:
|
||||
* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
|
||||
* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
|
||||
* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
|
||||
|
||||
Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
|
||||
Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md)
|
||||
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
|
||||
* `RestartLSN`: position in WAL confirmed by all safekeepers.
|
||||
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
|
||||
@@ -190,7 +190,7 @@ or we do not support them in zenith yet (pg_commit_ts).
|
||||
Tenant represents a single customer, interacting with Zenith.
|
||||
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
|
||||
One pageserver[] can serve multiple tenants at once.
|
||||
One safekeeper
|
||||
One safekeeper
|
||||
|
||||
See `docs/multitenancy.md` for more.
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ Init empty pageserver using `initdb` in temporary directory.
|
||||
|
||||
`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/.
|
||||
|
||||
Save`storage_dest` and other parameters in config.
|
||||
Save`storage_dest` and other parameters in config.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
|
||||
```
|
||||
@@ -21,7 +21,7 @@ zenith start
|
||||
```
|
||||
|
||||
#### 2. Restart pageserver (manually or crash-recovery).
|
||||
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
|
||||
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
|
||||
```
|
||||
@@ -32,7 +32,7 @@ zenith start
|
||||
Start pageserver from existing snapshot.
|
||||
Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
|
||||
Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation.
|
||||
Save`storage_dest` parameters in config.
|
||||
Save`storage_dest` parameters in config.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
```
|
||||
//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
|
||||
@@ -42,15 +42,15 @@ zenith start
|
||||
How to pass credentials needed for `snapshot_path`?
|
||||
|
||||
#### 4. Export.
|
||||
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
|
||||
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
|
||||
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
|
||||
```
|
||||
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
|
||||
```
|
||||
|
||||
#### Notes and questions
|
||||
- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
|
||||
- walkeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
|
||||
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
|
||||
- We can think of better names for all options.
|
||||
- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
|
||||
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
|
||||
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
|
||||
@@ -1,151 +0,0 @@
|
||||
# Dispatching a connection
|
||||
|
||||
For each client connection, Neon service needs to authenticate the
|
||||
connection, and route it to the right PostgreSQL instance.
|
||||
|
||||
## Authentication
|
||||
|
||||
There are three different ways to authenticate:
|
||||
|
||||
- anonymous; no authentication needed
|
||||
- PostgreSQL authentication
|
||||
- github single sign-on using browser
|
||||
|
||||
In anonymous access, the user doesn't need to perform any
|
||||
authentication at all. This can be used e.g. in interactive PostgreSQL
|
||||
documentation, allowing you to run the examples very quickly. Similar
|
||||
to sqlfiddle.com.
|
||||
|
||||
PostgreSQL authentication works the same as always. All the different
|
||||
PostgreSQL authentication options like SCRAM, kerberos, etc. are
|
||||
available. [1]
|
||||
|
||||
The third option is to authenticate with github single sign-on. When
|
||||
you open the connection in psql, you get a link that you open with
|
||||
your browser. Opening the link redirects you to github authentication,
|
||||
and lets the connection to proceed. This is also known as "Link auth" [2].
|
||||
|
||||
|
||||
## Routing the connection
|
||||
|
||||
When a client starts a connection, it needs to be routed to the
|
||||
correct PostgreSQL instance. Routing can be done by the proxy, acting
|
||||
as a man-in-the-middle, or the connection can be routed at the network
|
||||
level based on the hostname or IP address.
|
||||
|
||||
Either way, Neon needs to identify which PostgreSQL instance the
|
||||
connection should be routed to. If the instance is not already
|
||||
running, it needs to be started. Some connections always require a new
|
||||
PostgreSQL instance to be created, e.g. if you want to run a one-off
|
||||
query against a particular point-in-time.
|
||||
|
||||
The PostgreSQL instance is identified by:
|
||||
- Neon account (possibly anonymous)
|
||||
- cluster (known as tenant in the storage?)
|
||||
- branch or snapshot name
|
||||
- timestamp (PITR)
|
||||
- primary or read-replica
|
||||
- one-off read replica
|
||||
- one-off writeable branch
|
||||
|
||||
When you are using regular PostgreSQL authentication or anonymous
|
||||
access, the connection URL needs to contain all the information needed
|
||||
for the routing. With github single sign-on, the browser is involved
|
||||
and some details - the Neon account in particular - can be deduced
|
||||
from the authentication exchange.
|
||||
|
||||
There are three methods for identifying the PostgreSQL instance:
|
||||
|
||||
- Browser interaction (link auth)
|
||||
- Options in the connection URL and the domain name
|
||||
- A pre-defined endpoint, identified by domain name or IP address
|
||||
|
||||
### Link Auth
|
||||
|
||||
postgres://<username>@start.neon.tech/<dbname>
|
||||
|
||||
This gives you a link that you open in browser. Clicking the link
|
||||
performs github authentication, and the Neon account name is
|
||||
provided to the proxy behind the scenes. The proxy routes the
|
||||
connection to the primary PostgreSQL instance in cluster called
|
||||
"main", branch "main".
|
||||
|
||||
Further ideas:
|
||||
- You could pre-define a different target for link auth
|
||||
connections in the UI.
|
||||
- You could have a drop-down in the browser, allowing you to connect
|
||||
to any cluster you want. Link Auth can be like Teleport.
|
||||
|
||||
### Connection URL
|
||||
|
||||
The connection URL looks like this:
|
||||
|
||||
postgres://<username>@<cluster-id>.db.neon.tech/<dbname>
|
||||
|
||||
By default, this connects you to the primary PostgreSQL instance
|
||||
running on the "main" branch in the named cluster [3]. However, you can
|
||||
change that by specifying options in the connection URL. The following
|
||||
options are supported:
|
||||
|
||||
| option name | Description | Examples |
|
||||
| --- | --- | --- |
|
||||
| cluster | Cluster name | cluster:myproject |
|
||||
| branch | Branch name | branch:main |
|
||||
| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z |
|
||||
| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 |
|
||||
| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new |
|
||||
|
||||
For example, to read branch 'testing' as it was on Mar 31, 2022, you could
|
||||
specify a timestamp in the connection URL [4]:
|
||||
|
||||
postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31
|
||||
|
||||
Connecting with cluster name and options can be disabled in the UI. If
|
||||
disabled, you can only connect using a pre-defined endpoint.
|
||||
|
||||
### Pre-defined Endpoint
|
||||
|
||||
Instead of providing the cluster name, branch, and all those options
|
||||
in the connection URL, you can define a named endpoint with the same
|
||||
options.
|
||||
|
||||
In the UI, click "create endpoint". Fill in the details:
|
||||
|
||||
- Cluster name
|
||||
- Branch
|
||||
- timestamp or LSN
|
||||
- is this for the primary or for a read replica
|
||||
- etc.
|
||||
|
||||
When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect:
|
||||
|
||||
postgres://<username>@<endpoint-id>.endpoint.neon.tech/<dbname>
|
||||
|
||||
|
||||
An endpoint can be assigned a static or dynamic IP address, so that
|
||||
you can connect to it with clients that don't support TLS SNI. Maybe
|
||||
bypass the proxy altogether, but that ought to be invisible to the
|
||||
user.
|
||||
|
||||
You can limit the range of source IP addresses that are allowed to
|
||||
connect to an endpoint. An endpoint can also be exposed in an Amazon
|
||||
VPC, allowing direct connections from applications.
|
||||
|
||||
|
||||
# Footnotes
|
||||
|
||||
[1] I'm not sure how feasible it is to set up configure like Kerberos
|
||||
or LDAP in a cloud environment. But in principle I think we should
|
||||
allow customers to have the full power of PostgreSQL, including all
|
||||
authentication options. However, it's up to the customer to configure
|
||||
it correctly.
|
||||
|
||||
[2] Link is a way to both authenticate and to route the connection
|
||||
|
||||
[3] This assumes that cluster-ids are globally unique, across all
|
||||
Neon accounts.
|
||||
|
||||
[4] The syntax accepted in the connection URL is limited by libpq. The
|
||||
only way to pass arbitrary options to the server (or our proxy) is
|
||||
with the "options" keyword, and the options must be percent-encoded. I
|
||||
think the above would work but i haven't tested it
|
||||
@@ -1,79 +0,0 @@
|
||||
Cluster size limits
|
||||
==================
|
||||
|
||||
## Summary
|
||||
|
||||
One of the resource consumption limits for free-tier users is a cluster size limit.
|
||||
|
||||
To enforce it, we need to calculate the timeline size and check if the limit is reached before relation create/extend operations.
|
||||
If the limit is reached, the query must fail with some meaningful error/warning.
|
||||
We may want to exempt some operations from the quota to allow users free space to fit back into the limit.
|
||||
|
||||
The stateless compute node that performs validation is separate from the storage that calculates the usage, so we need to exchange cluster size information between those components.
|
||||
|
||||
## Motivation
|
||||
|
||||
Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future).
|
||||
First of all, this is needed to control our free tier production costs.
|
||||
Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters,
|
||||
so we don't want to give users access to the functionality that we don't think is ready.
|
||||
|
||||
## Components
|
||||
|
||||
* pageserver - calculate the size consumed by a timeline and add it to the feedback message.
|
||||
* safekeeper - pass feedback message from pageserver to compute.
|
||||
* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`.
|
||||
* console - set and update `zenith.max_cluster_size` setting
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
First of all, it's necessary to define timeline size.
|
||||
|
||||
The current approach is to count all data, including SLRUs. (not including WAL)
|
||||
Here we think of it as a physical disk underneath the Postgres cluster.
|
||||
This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
|
||||
|
||||
Alternatively, we could count only relation data. As in pg_database_size().
|
||||
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
||||
On the other hand, it puts us in a weaker position than other services, i.e., RDS.
|
||||
We will need to refactor the timeline_size counter or add another counter to implement it.
|
||||
|
||||
Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
|
||||
Then this size should be reported to compute node.
|
||||
|
||||
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
|
||||
|
||||
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
||||
|
||||
This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`.
|
||||
|
||||
Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable.
|
||||
|
||||
And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
|
||||
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html))
|
||||
|
||||
TODO:
|
||||
We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`.
|
||||
It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level.
|
||||
See issues https://github.com/neondatabase/neon/issues/1245
|
||||
https://github.com/zenithdb/zenith/issues/1445
|
||||
|
||||
TODO:
|
||||
We should warn users if the limit is soon to be reached.
|
||||
|
||||
### **Reliability, failure modes and corner cases**
|
||||
|
||||
1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
|
||||
|
||||
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
||||
|
||||
So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
||||
|
||||
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
||||
|
||||
|
||||
### **Security implications**
|
||||
|
||||
We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM.
|
||||
Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check.
|
||||
To cover this case, we also monitor the compute node size in the console.
|
||||
@@ -74,10 +74,6 @@ Every `compaction_period` seconds, the page server checks if
|
||||
maintenance operations, like compaction, are needed on the layer
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### compaction_target_size
|
||||
|
||||
File sizes for L0 delta and L1 image layers. Default is 128MB.
|
||||
|
||||
#### gc_horizon
|
||||
|
||||
`gz_horizon` determines how much history is retained, to allow
|
||||
@@ -89,14 +85,6 @@ away.
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
L0 delta layer threshold for L1 iamge layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
|
||||
#### initial_superuser_name
|
||||
|
||||
Name of the initial superuser role, passed to initdb when a new tenant
|
||||
@@ -168,9 +156,6 @@ access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
|
||||
# Secret access key to connect to the bucket ("password" part of the credentials)
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
|
||||
# S3 API query limit to avoid getting errors/throttling from AWS.
|
||||
concurrency_limit = 100
|
||||
```
|
||||
|
||||
###### General remote storage configuration
|
||||
@@ -182,8 +167,8 @@ Besides, there are parameters common for all types of remote storage that can be
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
|
||||
max_concurrent_timelines_sync = 50
|
||||
# Max number of concurrent connections to open for uploading to or downloading from the remote storage.
|
||||
max_concurrent_sync = 100
|
||||
|
||||
# Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
|
||||
max_sync_errors = 10
|
||||
|
||||
@@ -28,7 +28,12 @@ The pageserver has a few different duties:
|
||||
- Receive WAL from the WAL service and decode it.
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
|
||||
For more detailed info, see [/pageserver/README](/pageserver/README.md)
|
||||
For more detailed info, see `/pageserver/README`
|
||||
|
||||
`/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/proxy`:
|
||||
|
||||
@@ -52,12 +57,12 @@ PostgreSQL extension that implements storage manager API and network communicati
|
||||
|
||||
PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/safekeeper`:
|
||||
`/walkeeper`:
|
||||
|
||||
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
It acts as a holding area and redistribution center for recently generated WAL.
|
||||
|
||||
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
|
||||
For more detailed info, see `/walkeeper/README`
|
||||
|
||||
`/workspace_hack`:
|
||||
The workspace_hack crate exists only to pin down some dependencies.
|
||||
@@ -69,21 +74,14 @@ We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation.
|
||||
Main entry point for the 'zenith' CLI utility.
|
||||
TODO: Doesn't it belong to control_plane?
|
||||
|
||||
`/libs`:
|
||||
Unites granular neon helper crates under the hood.
|
||||
`/zenith_metrics`:
|
||||
|
||||
`/libs/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/libs/utils`:
|
||||
Generic helpers that are shared between other crates in this repository.
|
||||
A subject for future modularization.
|
||||
|
||||
`/libs/metrics`:
|
||||
Helpers for exposing Prometheus metrics from the server.
|
||||
|
||||
`/zenith_utils`:
|
||||
|
||||
Helpers that are shared between other crates in this repository.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
|
||||
@@ -3,14 +3,6 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
# It is simpler infra-wise to have failpoints enabled by default
|
||||
# It shouldnt affect perf in any way because failpoints
|
||||
# are not placed in hot code paths
|
||||
default = ["failpoints"]
|
||||
profiling = ["pprof"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
@@ -22,14 +14,15 @@ hex = "0.4.3"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "3.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-stream = "0.1.8"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
crc32c = "0.6.0"
|
||||
@@ -39,14 +32,12 @@ humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
humantime-serde = "1.1.1"
|
||||
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
tracing-futures = "0.2"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
@@ -57,10 +48,11 @@ fail = "0.5.0"
|
||||
rusoto_core = "0.47"
|
||||
rusoto_s3 = "0.47"
|
||||
async-trait = "0.1"
|
||||
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
|
||||
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -12,20 +12,20 @@
|
||||
//!
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use log::*;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::reltag::SlruKind;
|
||||
use crate::repository::Timeline;
|
||||
use crate::DatadirTimelineImpl;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
@@ -154,17 +154,9 @@ impl<'a> Basebackup<'a> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(
|
||||
img.len() == pg_constants::BLCKSZ as usize
|
||||
|| img.len() == pg_constants::BLCKSZ as usize + 8
|
||||
);
|
||||
} else {
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
}
|
||||
|
||||
slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]);
|
||||
slru_buf.extend_from_slice(&img);
|
||||
}
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
|
||||
@@ -7,7 +7,7 @@ use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::page_cache;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use utils::GIT_VERSION;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
|
||||
@@ -2,45 +2,38 @@
|
||||
|
||||
use std::{env, path::Path, str::FromStr};
|
||||
use tracing::*;
|
||||
use zenith_utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
GIT_VERSION,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, tenant_mgr, thread_mgr,
|
||||
http, page_cache, page_service,
|
||||
remote_storage::{self, SyncStartupData},
|
||||
repository::{Repository, TimelineSyncStatusUpdate},
|
||||
tenant_mgr, thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
timelines, virtual_file, LOG_FILE_NAME,
|
||||
};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::endpoint,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
GIT_VERSION,
|
||||
};
|
||||
|
||||
fn version() -> String {
|
||||
format!(
|
||||
"{} profiling:{} failpoints:{}",
|
||||
GIT_VERSION,
|
||||
cfg!(feature = "profiling"),
|
||||
fail::has_failpoints()
|
||||
)
|
||||
}
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals::{self, Signal};
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
metrics::set_common_metrics_prefix("pageserver");
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(&*version())
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
@@ -85,23 +78,8 @@ fn main() -> anyhow::Result<()> {
|
||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
|
||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enabled-features")
|
||||
.long("enabled-features")
|
||||
.takes_value(false)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if arg_matches.is_present("enabled-features") {
|
||||
let features: &[&str] = &[
|
||||
#[cfg(feature = "failpoints")]
|
||||
"failpoints",
|
||||
];
|
||||
println!("{{\"features\": {features:?} }}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
@@ -182,14 +160,6 @@ fn main() -> anyhow::Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// If failpoints are used, terminate the whole pageserver process if they are hit.
|
||||
let scenario = FailScenario::setup();
|
||||
if fail::has_failpoints() {
|
||||
std::panic::set_hook(Box::new(|_| {
|
||||
std::process::exit(1);
|
||||
}));
|
||||
}
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
@@ -205,12 +175,10 @@ fn main() -> anyhow::Result<()> {
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")
|
||||
}
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
|
||||
@@ -263,8 +231,46 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
// Initialize repositories with locally available timelines.
|
||||
// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
// are scheduled for download and added to the repository once download is completed.
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
// initialize local tenant
|
||||
let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index);
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
remote_storage::LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {} for tenant {} is locally complete, registering it in repository", tenant_id, timeline_id);
|
||||
// Lets fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
repo.apply_timeline_remote_sync_status_update(
|
||||
timeline_id,
|
||||
TimelineSyncStatusUpdate::Downloaded,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to bootstrap timeline {} for tenant {}",
|
||||
timeline_id, tenant_id
|
||||
)
|
||||
})?
|
||||
}
|
||||
remote_storage::LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {} for tenant {} needs sync, \
|
||||
so skipped for adding into repository until sync is finished",
|
||||
tenant_id, timeline_id
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
@@ -277,8 +283,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
|
||||
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let auth_cloned = auth.clone();
|
||||
@@ -289,7 +293,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"http_endpoint_thread",
|
||||
false,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index)?;
|
||||
let router = http::make_router(conf, auth_cloned, remote_index);
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
},
|
||||
)?;
|
||||
@@ -311,7 +315,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
std::process::exit(111);
|
||||
}
|
||||
|
||||
@@ -320,8 +323,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
pageserver::shutdown_pageserver(0);
|
||||
pageserver::shutdown_pageserver();
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
334
pageserver/src/bin/pageserver_zst.rs
Normal file
334
pageserver/src/bin/pageserver_zst.rs
Normal file
@@ -0,0 +1,334 @@
|
||||
//! A CLI helper to deal with remote storage (S3, usually) blobs as archives.
|
||||
//! See [`compression`] for more details about the archives.
|
||||
|
||||
use std::{collections::BTreeSet, path::Path};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use clap::{App, Arg};
|
||||
use pageserver::{
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
remote_storage::compression,
|
||||
};
|
||||
use tokio::{fs, io};
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
const LIST_SUBCOMMAND: &str = "list";
|
||||
const ARCHIVE_ARG_NAME: &str = "archive";
|
||||
|
||||
const EXTRACT_SUBCOMMAND: &str = "extract";
|
||||
const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory";
|
||||
|
||||
const CREATE_SUBCOMMAND: &str = "create";
|
||||
const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory";
|
||||
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("pageserver zst blob [un]compressor utility")
|
||||
.version(GIT_VERSION)
|
||||
.subcommands(vec![
|
||||
App::new(LIST_SUBCOMMAND)
|
||||
.about("List the archive contents")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to list the contents of"),
|
||||
),
|
||||
App::new(EXTRACT_SUBCOMMAND)
|
||||
.about("Extracts the archive into the directory")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to extract"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to extract the archive into. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
App::new(CREATE_SUBCOMMAND)
|
||||
.about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)")
|
||||
.arg(
|
||||
Arg::new(SOURCE_DIRECTORY_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("A directory to use for creating the archive"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to create the archive in. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
])
|
||||
.get_matches();
|
||||
|
||||
let subcommand_name = match arg_matches.subcommand_name() {
|
||||
Some(name) => name,
|
||||
None => bail!("No subcommand specified"),
|
||||
};
|
||||
|
||||
let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) {
|
||||
Some(matches) => matches,
|
||||
None => bail!(
|
||||
"No subcommand arguments were recognized for subcommand '{}'",
|
||||
subcommand_name
|
||||
),
|
||||
};
|
||||
|
||||
let target_dir = Path::new(
|
||||
subcommand_matches
|
||||
.value_of(TARGET_DIRECTORY_ARG_NAME)
|
||||
.unwrap_or("./"),
|
||||
);
|
||||
|
||||
match subcommand_name {
|
||||
LIST_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
list_archive(archive).await
|
||||
}
|
||||
EXTRACT_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
extract_archive(archive, target_dir).await
|
||||
}
|
||||
CREATE_SUBCOMMAND => {
|
||||
let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) {
|
||||
Some(source) => Path::new(source),
|
||||
None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME),
|
||||
};
|
||||
create_archive(source_dir, target_dir).await
|
||||
}
|
||||
unknown => bail!("Unknown subcommand {}", unknown),
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_archive(archive: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
println!("Listing an archive at path '{}'", archive.display());
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
let archive_bytes = fs::read(&archive)
|
||||
.await
|
||||
.context("Failed to read the archive bytes")?;
|
||||
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_bytes.as_slice())
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
|
||||
let empty_path = Path::new("");
|
||||
println!("-------------------------------");
|
||||
|
||||
let longest_path_in_archive = header
|
||||
.files
|
||||
.iter()
|
||||
.filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len()))
|
||||
.max()
|
||||
.unwrap_or_default()
|
||||
.max(METADATA_FILE_NAME.len());
|
||||
|
||||
for regular_file in &header.files {
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
regular_file.subpath.as_path(empty_path).display(),
|
||||
regular_file.size,
|
||||
width = longest_path_in_archive,
|
||||
)
|
||||
}
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
METADATA_FILE_NAME,
|
||||
header.metadata_file_size,
|
||||
width = longest_path_in_archive,
|
||||
);
|
||||
println!("-------------------------------");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
let mut dir_contents = fs::read_dir(&target_dir)
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
let dir_entry = dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
ensure!(
|
||||
dir_entry.is_none(),
|
||||
"Target directory '{}' is not empty",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Extracting an archive at path '{}' into directory '{}'",
|
||||
archive.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut archive_file = fs::File::open(&archive).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to extract the archive")
|
||||
}
|
||||
|
||||
async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let source_dir = source_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the source dir path '{}'",
|
||||
source_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
source_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
source_dir.display()
|
||||
);
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Compressing directory '{}' and creating resulting archive in directory '{}'",
|
||||
source_dir.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut metadata_file_contents = None;
|
||||
let mut files_co_archive = Vec::new();
|
||||
|
||||
let mut source_dir_contents = fs::read_dir(&source_dir)
|
||||
.await
|
||||
.context("Failed to read the source directory contents")?;
|
||||
|
||||
while let Some(source_dir_entry) = source_dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to read a source dir entry")?
|
||||
{
|
||||
let entry_path = source_dir_entry.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) {
|
||||
let metadata_bytes = fs::read(entry_path)
|
||||
.await
|
||||
.context("Failed to read metata file bytes in the source dir")?;
|
||||
metadata_file_contents = Some(
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.context("Failed to parse metata file contents in the source dir")?,
|
||||
);
|
||||
} else {
|
||||
files_co_archive.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let metadata = match metadata_file_contents {
|
||||
Some(metadata) => metadata,
|
||||
None => bail!(
|
||||
"No metadata file found in the source dir '{}', cannot create the archive",
|
||||
source_dir.display()
|
||||
),
|
||||
};
|
||||
|
||||
let _ = compression::archive_files_as_stream(
|
||||
&source_dir,
|
||||
files_co_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = target_dir.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("Failed to create an archive")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,75 +0,0 @@
|
||||
use std::str::FromStr;
|
||||
|
||||
use anyhow::Result;
|
||||
use postgres_ffi::{pg_constants::WAL_SEGMENT_SIZE, waldecoder::WalStreamDecoder};
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use tokio::net::TcpStream;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
|
||||
struct PageServiceApi {
|
||||
stream: TcpStream,
|
||||
}
|
||||
|
||||
impl PageServiceApi {
|
||||
async fn connect(tenant: &ZTenantId, timeline: &ZTimelineId, connstr: &str) -> Result<Self> {
|
||||
let mut stream = TcpStream::connect("localhost:15000").await?;
|
||||
|
||||
// Connect to pageserver
|
||||
// TODO read host, port, dbname, user from command line
|
||||
let (client, conn) = tokio_postgres::Config::new()
|
||||
.host("127.0.0.1")
|
||||
.port(15000)
|
||||
.dbname("postgres")
|
||||
.user("zenith_admin")
|
||||
.connect_raw(&mut stream, tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
let init_query = format!("callmemaybe {} {} {}", tenant, timeline, connstr);
|
||||
tokio::select! {
|
||||
_ = conn => panic!("connection closed during callmemaybe"),
|
||||
_ = client.query(init_query.as_str(), &[]) => (),
|
||||
};
|
||||
|
||||
Ok(Self { stream })
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
use clap::{App, Arg};
|
||||
let arg_matches = App::new("Replay")
|
||||
.arg(
|
||||
Arg::new("tenant")
|
||||
.long("tenant")
|
||||
.takes_value(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("timeline")
|
||||
.long("timeline")
|
||||
.takes_value(true)
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let partial_path = "/home/bojan/tmp/sk_wal";
|
||||
let startpos = Lsn(23761464); // I got this by grepping sk log for "restart decoder"
|
||||
let xlogoff: usize = startpos.segment_offset(WAL_SEGMENT_SIZE);
|
||||
|
||||
let mut decoder = WalStreamDecoder::new(startpos);
|
||||
let bytes = std::fs::read(partial_path)?;
|
||||
decoder.feed_bytes(&bytes[xlogoff..(xlogoff+10000)]);
|
||||
|
||||
while let Some((lsn, rec)) = decoder.poll_decode()? {
|
||||
println!("lsn: {}", lsn);
|
||||
}
|
||||
|
||||
// TODO start replication server, get connstr
|
||||
|
||||
let tenant = ZTenantId::from_str(arg_matches.value_of("tenant").unwrap())?;
|
||||
let timeline = ZTimelineId::from_str(arg_matches.value_of("timeline").unwrap())?;
|
||||
let connstr = "lol";
|
||||
let mut api = PageServiceApi::connect(&tenant, &timeline, connstr).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6,7 +6,8 @@ use clap::{App, Arg};
|
||||
use pageserver::layered_repository::metadata::TimelineMetadata;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use utils::{lsn::Lsn, GIT_VERSION};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith update metadata utility")
|
||||
|
||||
@@ -4,24 +4,22 @@
|
||||
//! file, or on the command line.
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use utils::{
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
@@ -29,22 +27,27 @@ pub mod defaults {
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 10;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -59,6 +62,13 @@ pub mod defaults {
|
||||
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
|
||||
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
|
||||
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
@@ -67,17 +77,6 @@ pub mod defaults {
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
|
||||
#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
|
||||
|
||||
# [remote_storage]
|
||||
|
||||
"###
|
||||
@@ -95,6 +94,22 @@ pub struct PageServerConf {
|
||||
/// Example (default): 127.0.0.1:9898
|
||||
pub listen_http_addr: String,
|
||||
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub checkpoint_distance: u64,
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
|
||||
// How often to check if there's compaction work to be done.
|
||||
pub compaction_period: Duration,
|
||||
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
pub wait_lsn_timeout: Duration,
|
||||
// How long to wait for WAL redo to complete.
|
||||
@@ -119,28 +134,6 @@ pub struct PageServerConf {
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
|
||||
pub profiling: ProfilingConfig,
|
||||
pub default_tenant_conf: TenantConf,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum ProfilingConfig {
|
||||
Disabled,
|
||||
PageRequests,
|
||||
}
|
||||
|
||||
impl FromStr for ProfilingConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
|
||||
let result = match s {
|
||||
"disabled" => ProfilingConfig::Disabled,
|
||||
"page_requests" => ProfilingConfig::PageRequests,
|
||||
_ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
@@ -165,6 +158,14 @@ struct PageServerConfigBuilder {
|
||||
|
||||
listen_http_addr: BuilderValue<String>,
|
||||
|
||||
checkpoint_distance: BuilderValue<u64>,
|
||||
|
||||
compaction_target_size: BuilderValue<u64>,
|
||||
compaction_period: BuilderValue<Duration>,
|
||||
|
||||
gc_horizon: BuilderValue<u64>,
|
||||
gc_period: BuilderValue<Duration>,
|
||||
|
||||
wait_lsn_timeout: BuilderValue<Duration>,
|
||||
wal_redo_timeout: BuilderValue<Duration>,
|
||||
|
||||
@@ -184,8 +185,6 @@ struct PageServerConfigBuilder {
|
||||
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
id: BuilderValue<ZNodeId>,
|
||||
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -195,6 +194,13 @@ impl Default for PageServerConfigBuilder {
|
||||
Self {
|
||||
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
|
||||
compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE),
|
||||
compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period")),
|
||||
gc_horizon: Set(DEFAULT_GC_HORIZON),
|
||||
gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period")),
|
||||
wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||
.expect("cannot parse default wait lsn timeout")),
|
||||
wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
@@ -210,7 +216,6 @@ impl Default for PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -224,6 +229,26 @@ impl PageServerConfigBuilder {
|
||||
self.listen_http_addr = BuilderValue::Set(listen_http_addr)
|
||||
}
|
||||
|
||||
pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) {
|
||||
self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
|
||||
}
|
||||
|
||||
pub fn compaction_target_size(&mut self, compaction_target_size: u64) {
|
||||
self.compaction_target_size = BuilderValue::Set(compaction_target_size)
|
||||
}
|
||||
|
||||
pub fn compaction_period(&mut self, compaction_period: Duration) {
|
||||
self.compaction_period = BuilderValue::Set(compaction_period)
|
||||
}
|
||||
|
||||
pub fn gc_horizon(&mut self, gc_horizon: u64) {
|
||||
self.gc_horizon = BuilderValue::Set(gc_horizon)
|
||||
}
|
||||
|
||||
pub fn gc_period(&mut self, gc_period: Duration) {
|
||||
self.gc_period = BuilderValue::Set(gc_period)
|
||||
}
|
||||
|
||||
pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
|
||||
self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
|
||||
}
|
||||
@@ -271,46 +296,52 @@ impl PageServerConfigBuilder {
|
||||
self.id = BuilderValue::Set(node_id)
|
||||
}
|
||||
|
||||
pub fn profiling(&mut self, profiling: ProfilingConfig) {
|
||||
self.profiling = BuilderValue::Set(profiling)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<PageServerConf> {
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
||||
.ok_or(anyhow::anyhow!("missing listen_pg_addr"))?,
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
.ok_or(anyhow::anyhow!("missing listen_http_addr"))?,
|
||||
checkpoint_distance: self
|
||||
.checkpoint_distance
|
||||
.ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
|
||||
compaction_target_size: self
|
||||
.compaction_target_size
|
||||
.ok_or(anyhow::anyhow!("missing compaction_target_size"))?,
|
||||
compaction_period: self
|
||||
.compaction_period
|
||||
.ok_or(anyhow::anyhow!("missing compaction_period"))?,
|
||||
gc_horizon: self
|
||||
.gc_horizon
|
||||
.ok_or(anyhow::anyhow!("missing gc_horizon"))?,
|
||||
gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
.ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?,
|
||||
wal_redo_timeout: self
|
||||
.wal_redo_timeout
|
||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
||||
.ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?,
|
||||
page_cache_size: self
|
||||
.page_cache_size
|
||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
||||
.ok_or(anyhow::anyhow!("missing page_cache_size"))?,
|
||||
max_file_descriptors: self
|
||||
.max_file_descriptors
|
||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
||||
.ok_or(anyhow::anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?,
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?,
|
||||
.ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
.ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?,
|
||||
remote_storage_config: self
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
.ok_or(anyhow::anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow::anyhow!("missing id"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -319,7 +350,7 @@ impl PageServerConfigBuilder {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_timelines_sync: NonZeroUsize,
|
||||
pub max_concurrent_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
@@ -360,9 +391,6 @@ pub struct S3Config {
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
@@ -371,7 +399,6 @@ impl std::fmt::Debug for S3Config {
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
@@ -417,12 +444,17 @@ impl PageServerConf {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
|
||||
for (key, item) in toml.iter() {
|
||||
match key {
|
||||
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
|
||||
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
|
||||
"checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
|
||||
"compaction_target_size" => {
|
||||
builder.compaction_target_size(parse_toml_u64(key, item)?)
|
||||
}
|
||||
"compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?),
|
||||
"gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
|
||||
"gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
|
||||
"wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
|
||||
"wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
|
||||
"initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
|
||||
@@ -436,16 +468,12 @@ impl PageServerConf {
|
||||
"auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
|
||||
PathBuf::from(parse_toml_string(key, item)?),
|
||||
)),
|
||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||
"auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
|
||||
}
|
||||
"tenant_config" => {
|
||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||
}
|
||||
"id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
|
||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
_ => bail!("unrecognized pageserver option '{}'", key),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -471,75 +499,41 @@ impl PageServerConf {
|
||||
);
|
||||
}
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
// subroutine of parse_and_validate to parse `[tenant_conf]` section
|
||||
|
||||
pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
|
||||
t_conf.checkpoint_distance =
|
||||
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_target_size) = item.get("compaction_target_size") {
|
||||
t_conf.compaction_target_size = Some(parse_toml_u64(
|
||||
"compaction_target_size",
|
||||
compaction_target_size,
|
||||
)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_period) = item.get("compaction_period") {
|
||||
t_conf.compaction_period =
|
||||
Some(parse_toml_duration("compaction_period", compaction_period)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_threshold) = item.get("compaction_threshold") {
|
||||
t_conf.compaction_threshold =
|
||||
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
|
||||
}
|
||||
|
||||
if let Some(gc_horizon) = item.get("gc_horizon") {
|
||||
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
|
||||
}
|
||||
|
||||
if let Some(gc_period) = item.get("gc_period") {
|
||||
t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
|
||||
}
|
||||
|
||||
if let Some(pitr_interval) = item.get("pitr_interval") {
|
||||
t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
/// subroutine of parse_config(), to parse the `[remote_storage]` table.
|
||||
fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
|
||||
let max_concurrent_timelines_sync = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_timelines_sync", toml)?
|
||||
.unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC),
|
||||
)
|
||||
.context("Failed to parse 'max_concurrent_timelines_sync' as a positive integer")?;
|
||||
|
||||
let max_sync_errors = NonZeroU32::new(
|
||||
parse_optional_integer("max_sync_errors", toml)?
|
||||
.unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||
)
|
||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||
|
||||
let concurrency_limit = NonZeroUsize::new(
|
||||
parse_optional_integer("concurrency_limit", toml)?
|
||||
.unwrap_or(defaults::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
let max_concurrent_sync: NonZeroUsize = if let Some(s) = toml.get("max_concurrent_sync") {
|
||||
parse_toml_u64("max_concurrent_sync", s)
|
||||
.and_then(|toml_u64| {
|
||||
toml_u64.try_into().with_context(|| {
|
||||
format!("'max_concurrent_sync' value {} is too large", toml_u64)
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(NonZeroUsize::new)
|
||||
.context("'max_concurrent_sync' must be a non-zero positive integer")?
|
||||
} else {
|
||||
NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap()
|
||||
};
|
||||
let max_sync_errors: NonZeroU32 = if let Some(s) = toml.get("max_sync_errors") {
|
||||
parse_toml_u64("max_sync_errors", s)
|
||||
.and_then(|toml_u64| {
|
||||
toml_u64.try_into().with_context(|| {
|
||||
format!("'max_sync_errors' value {} is too large", toml_u64)
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(NonZeroU32::new)
|
||||
.context("'max_sync_errors' must be a non-zero positive integer")?
|
||||
} else {
|
||||
NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap()
|
||||
};
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
(None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
|
||||
@@ -570,7 +564,6 @@ impl PageServerConf {
|
||||
.get("endpoint")
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
}),
|
||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
||||
parse_toml_string("local_path", local_path)?,
|
||||
@@ -579,7 +572,7 @@ impl PageServerConf {
|
||||
};
|
||||
|
||||
Ok(RemoteStorageConfig {
|
||||
max_concurrent_timelines_sync,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
})
|
||||
@@ -587,13 +580,18 @@ impl PageServerConf {
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
PathBuf::from(format!("../tmp_check/test_{}", test_name))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
PageServerConf {
|
||||
id: ZNodeId(0),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: 4 * 1024 * 1024,
|
||||
compaction_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
wal_redo_timeout: Duration::from_secs(60),
|
||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||
@@ -606,8 +604,6 @@ impl PageServerConf {
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::dummy_conf(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -617,7 +613,7 @@ impl PageServerConf {
|
||||
fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
@@ -626,46 +622,26 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
||||
// for our use, though.
|
||||
let i: i64 = item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?;
|
||||
.with_context(|| format!("configure option {} is not an integer", name))?;
|
||||
if i < 0 {
|
||||
bail!("configure option {name} cannot be negative");
|
||||
bail!("configure option {} cannot be negative", name);
|
||||
}
|
||||
Ok(i as u64)
|
||||
}
|
||||
|
||||
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
|
||||
where
|
||||
I: TryFrom<i64, Error = E>,
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
let toml_integer = match item.get(name) {
|
||||
Some(item) => item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
I::try_from(toml_integer)
|
||||
.map(Some)
|
||||
.with_context(|| format!("configure option {name} is too large"))
|
||||
}
|
||||
|
||||
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
|
||||
Ok(humantime::parse_duration(s)?)
|
||||
}
|
||||
|
||||
fn parse_toml_from_str<T>(name: &str, item: &Item) -> Result<T>
|
||||
where
|
||||
T: FromStr<Err = anyhow::Error>,
|
||||
{
|
||||
fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> {
|
||||
let v = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
T::from_str(v)
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
AuthType::from_str(v)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -682,6 +658,14 @@ mod tests {
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
|
||||
checkpoint_distance = 111 # in bytes
|
||||
|
||||
compaction_target_size = 111 # in bytes
|
||||
compaction_period = '111 s'
|
||||
|
||||
gc_period = '222 s'
|
||||
gc_horizon = 222
|
||||
|
||||
wait_lsn_timeout = '111 s'
|
||||
wal_redo_timeout = '111 s'
|
||||
|
||||
@@ -702,8 +686,10 @@ id = 10
|
||||
let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"));
|
||||
let parsed_config =
|
||||
PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
parsed_config,
|
||||
@@ -711,6 +697,11 @@ id = 10
|
||||
id: ZNodeId(10),
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE,
|
||||
compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
|
||||
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
||||
wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
|
||||
superuser: defaults::DEFAULT_SUPERUSER.to_string(),
|
||||
@@ -721,8 +712,6 @@ id = 10
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -736,13 +725,16 @@ id = 10
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
|
||||
let config_string = format!(
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'",
|
||||
"{}pg_distrib_dir='{}'",
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"));
|
||||
let parsed_config =
|
||||
PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
parsed_config,
|
||||
@@ -750,6 +742,11 @@ id = 10
|
||||
id: ZNodeId(10),
|
||||
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||
checkpoint_distance: 111,
|
||||
compaction_target_size: 111,
|
||||
compaction_period: Duration::from_secs(111),
|
||||
gc_horizon: 222,
|
||||
gc_period: Duration::from_secs(222),
|
||||
wait_lsn_timeout: Duration::from_secs(111),
|
||||
wal_redo_timeout: Duration::from_secs(111),
|
||||
superuser: "zzzz".to_string(),
|
||||
@@ -760,8 +757,6 @@ id = 10
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -790,33 +785,37 @@ local_path = '{}'"#,
|
||||
|
||||
for remote_storage_config_str in identical_toml_declarations {
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
r#"{}
|
||||
pg_distrib_dir='{}'
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
{}"#,
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display(),
|
||||
remote_storage_config_str,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"))
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for the local FS");
|
||||
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_timelines_sync: NonZeroUsize::new(
|
||||
defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_TIMELINES_SYNC
|
||||
)
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_sync: NonZeroUsize::new(
|
||||
defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -832,49 +831,52 @@ pg_distrib_dir='{}'
|
||||
let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
|
||||
let secret_access_key = "SOMEsEcReTsd292v".to_string();
|
||||
let endpoint = "http://localhost:5000".to_string();
|
||||
let max_concurrent_timelines_sync = NonZeroUsize::new(111).unwrap();
|
||||
let max_concurrent_sync = NonZeroUsize::new(111).unwrap();
|
||||
let max_sync_errors = NonZeroU32::new(222).unwrap();
|
||||
let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
|
||||
|
||||
let identical_toml_declarations = &[
|
||||
format!(
|
||||
r#"[remote_storage]
|
||||
max_concurrent_timelines_sync = {max_concurrent_timelines_sync}
|
||||
max_sync_errors = {max_sync_errors}
|
||||
bucket_name = '{bucket_name}'
|
||||
bucket_region = '{bucket_region}'
|
||||
prefix_in_bucket = '{prefix_in_bucket}'
|
||||
access_key_id = '{access_key_id}'
|
||||
secret_access_key = '{secret_access_key}'
|
||||
endpoint = '{endpoint}'
|
||||
concurrency_limit = {s3_concurrency_limit}"#
|
||||
max_concurrent_sync = {}
|
||||
max_sync_errors = {}
|
||||
bucket_name = '{}'
|
||||
bucket_region = '{}'
|
||||
prefix_in_bucket = '{}'
|
||||
access_key_id = '{}'
|
||||
secret_access_key = '{}'
|
||||
endpoint = '{}'"#,
|
||||
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
|
||||
),
|
||||
format!(
|
||||
"remote_storage={{max_concurrent_timelines_sync={max_concurrent_timelines_sync}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
|
||||
bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', access_key_id='{access_key_id}', secret_access_key='{secret_access_key}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
|
||||
"remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}",
|
||||
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
|
||||
),
|
||||
];
|
||||
|
||||
for remote_storage_config_str in identical_toml_declarations {
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
r#"{}
|
||||
pg_distrib_dir='{}'
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
{}"#,
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display(),
|
||||
remote_storage_config_str,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e}"))
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for S3");
|
||||
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_timelines_sync,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: bucket_name.clone(),
|
||||
@@ -882,8 +884,7 @@ pg_distrib_dir='{}'
|
||||
access_key_id: Some(access_key_id.clone()),
|
||||
secret_access_key: Some(secret_access_key.clone()),
|
||||
prefix_in_bucket: Some(prefix_in_bucket.clone()),
|
||||
endpoint: Some(endpoint.clone()),
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
endpoint: Some(endpoint.clone())
|
||||
}),
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
@@ -20,19 +20,11 @@ pub struct TimelineCreateRequest {
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Default)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub new_tenant_id: Option<ZTenantId>,
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
pub pitr_interval: Option<String>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -44,44 +36,3 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId
|
||||
pub struct StatusResponse {
|
||||
pub id: ZNodeId,
|
||||
}
|
||||
|
||||
impl TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: Option<ZTenantId>) -> TenantCreateRequest {
|
||||
TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantConfigRequest {
|
||||
pub tenant_id: ZTenantId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
pub pitr_interval: Option<String>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest {
|
||||
TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: None,
|
||||
compaction_target_size: None,
|
||||
compaction_period: None,
|
||||
compaction_threshold: None,
|
||||
gc_horizon: None,
|
||||
gc_period: None,
|
||||
image_creation_threshold: None,
|
||||
pitr_interval: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -328,7 +328,11 @@ paths:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantCreateInfo"
|
||||
type: object
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
responses:
|
||||
"201":
|
||||
description: New tenant created successfully
|
||||
@@ -367,48 +371,7 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
description: |
|
||||
Update tenant's config.
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Malformed tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -426,45 +389,6 @@ components:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
TenantCreateInfo:
|
||||
type: object
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
gc_period:
|
||||
type: string
|
||||
gc_horizon:
|
||||
type: integer
|
||||
pitr_interval:
|
||||
type: string
|
||||
checkpoint_distance:
|
||||
type: integer
|
||||
compaction_period:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
TenantConfigInfo:
|
||||
type: object
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
gc_period:
|
||||
type: string
|
||||
gc_horizon:
|
||||
type: integer
|
||||
pitr_interval:
|
||||
type: string
|
||||
checkpoint_distance:
|
||||
type: integer
|
||||
compaction_period:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -485,7 +409,6 @@ components:
|
||||
type: object
|
||||
required:
|
||||
- awaits_download
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
awaits_download:
|
||||
type: boolean
|
||||
|
||||
@@ -1,45 +1,36 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
use zenith_utils::http::endpoint::check_permission;
|
||||
use zenith_utils::http::error::ApiError;
|
||||
use zenith_utils::http::{
|
||||
endpoint,
|
||||
error::HttpErrorBody,
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
};
|
||||
use zenith_utils::http::{RequestExt, RouterBuilder};
|
||||
use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
|
||||
TimelineCreateRequest,
|
||||
};
|
||||
use crate::config::RemoteStorageKind;
|
||||
use crate::remote_storage::{
|
||||
download_index_part, schedule_timeline_download, LocalFs, RemoteIndex, RemoteTimeline, S3Bucket,
|
||||
StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
|
||||
};
|
||||
use crate::remote_storage::{schedule_timeline_download, RemoteIndex};
|
||||
use crate::repository::Repository;
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -47,34 +38,17 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> Self {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
// Note that this remote storage is created separately from the main one in the sync_loop.
|
||||
// It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
LocalFs::new(root.clone(), &conf.workdir).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
S3Bucket::new(s3_config, &conf.workdir).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
})
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
Ok(Self {
|
||||
Self {
|
||||
conf,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_index,
|
||||
remote_storage,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,8 +122,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -179,47 +153,43 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
let (local_timeline_info, remote_timeline_info) = async {
|
||||
// any error here will render local timeline as None
|
||||
// XXX .in_current_span does not attach messages in spawn_blocking future to current future's span
|
||||
let local_timeline_info = tokio::task::spawn_blocking(move || {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
Ok::<_, anyhow::Error>(local_timeline)
|
||||
})
|
||||
.await
|
||||
.ok()
|
||||
.and_then(|r| r.ok())
|
||||
.flatten();
|
||||
let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
let (local_timeline_info, span) = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
(local_timeline_info, remote_timeline_info)
|
||||
}
|
||||
.instrument(info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
Ok::<_, anyhow::Error>((local_timeline, entered.exit()))
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
})
|
||||
};
|
||||
|
||||
let _enter = span.entered();
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
return Err(ApiError::NotFound(
|
||||
@@ -242,103 +212,39 @@ async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
info!(
|
||||
"Handling timeline {} attach for tenant: {}",
|
||||
timeline_id, tenant_id,
|
||||
);
|
||||
let span = info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
|
||||
let span = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
|
||||
// TODO: maybe answer with 309 Not Modified here?
|
||||
anyhow::bail!("Timeline is already present locally")
|
||||
};
|
||||
Ok(())
|
||||
Ok(entered.exit())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
let mut remote_index_write = get_state(&request).remote_index.write().await;
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
} else {
|
||||
// no timeline in the index, release the lock to make the potentially lengthy download opetation
|
||||
drop(index_accessor);
|
||||
}
|
||||
|
||||
let new_timeline = match try_download_shard_data(state, sync_id).await {
|
||||
Ok(Some(mut new_timeline)) => {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
new_timeline.awaits_download = true;
|
||||
new_timeline
|
||||
}
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote timeline data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote timeline".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
match index_accessor.timeline_entry_mut(&sync_id) {
|
||||
Some(remote_timeline) => {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
|
||||
}
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn try_download_shard_data(
|
||||
state: &State,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<Option<RemoteTimeline>> {
|
||||
let shard = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
download_index_part(state.conf, local_storage, sync_id).await
|
||||
}
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
download_index_part(state.conf, s3_storage, sync_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index shard for timeline {}", sync_id))?;
|
||||
|
||||
let timeline_path = state
|
||||
.conf
|
||||
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
RemoteTimeline::from_index_part(&timeline_path, shard)
|
||||
.map(Some)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to convert index shard into remote timeline for timeline {}",
|
||||
sync_id
|
||||
)
|
||||
let _enter = span.entered(); // entered guard cannot live across awaits (non Send)
|
||||
let index_entry = remote_index_write
|
||||
.timeline_entry_mut(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.ok_or_else(|| ApiError::NotFound("Unknown remote timeline".to_string()))?;
|
||||
|
||||
if index_entry.get_awaits_download() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
index_entry.set_awaits_download(true);
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -351,8 +257,8 @@ async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let _enter =
|
||||
info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
|
||||
.entered();
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
repo.detach_timeline(timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -369,7 +275,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
crate::tenant_mgr::list_tenants()
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -381,28 +287,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
let remote_index = get_state(&request).remote_index.clone();
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
let target_tenant_id = request_data
|
||||
.new_tenant_id
|
||||
.map(ZTenantId::from)
|
||||
@@ -410,9 +294,8 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let new_tenant_id = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let conf = get_config(&request);
|
||||
|
||||
tenant_mgr::create_tenant_repository(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
tenant_mgr::create_tenant_repository(get_config(&request), target_tenant_id, remote_index)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -423,45 +306,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
})
|
||||
}
|
||||
|
||||
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantConfigRequest = json_request(&mut request).await?;
|
||||
let tenant_id = request_data.tenant_id;
|
||||
// check for management permission
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
tenant_mgr::update_tenant_config(tenant_conf, tenant_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -473,7 +317,7 @@ pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
if auth.is_some() {
|
||||
@@ -487,14 +331,11 @@ pub fn make_router(
|
||||
}))
|
||||
}
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
|
||||
))
|
||||
router
|
||||
.data(Arc::new(State::new(conf, auth, remote_index)))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.put("/v1/tenant/config", tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.get(
|
||||
@@ -509,5 +350,5 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
|
||||
timeline_detach_handler,
|
||||
)
|
||||
.any(handler_404))
|
||||
.any(handler_404)
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,20 +1,12 @@
|
||||
//!
|
||||
//! Functions for reading and writing variable-sized "blobs".
|
||||
//!
|
||||
//! Each blob begins with a 1- or 4-byte length field, followed by the
|
||||
//! actual data. If the length is smaller than 128 bytes, the length
|
||||
//! is written as a one byte. If it's larger than that, the length
|
||||
//! is written as a four-byte integer, in big-endian, with the high
|
||||
//! bit set. This way, we can detect whether it's 1- or 4-byte header
|
||||
//! by peeking at the first byte.
|
||||
//!
|
||||
//! len < 128: 0XXXXXXX
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//! Each blob begins with a 4-byte length, followed by the actual data.
|
||||
//!
|
||||
use crate::layered_repository::block_io::{BlockCursor, BlockReader};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::io::Error;
|
||||
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
@@ -48,30 +40,21 @@ where
|
||||
|
||||
let mut buf = self.read_blk(blknum)?;
|
||||
|
||||
// peek at the first byte, to determine if it's a 1- or 4-byte length
|
||||
let first_len_byte = buf[off];
|
||||
let len: usize = if first_len_byte < 0x80 {
|
||||
// 1-byte length header
|
||||
off += 1;
|
||||
first_len_byte as usize
|
||||
// read length
|
||||
let mut len_buf = [0u8; 4];
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
// 4-byte length header
|
||||
let mut len_buf = [0u8; 4];
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
len_buf[0] &= 0x7f;
|
||||
u32::from_be_bytes(len_buf) as usize
|
||||
};
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
let len = u32::from_ne_bytes(len_buf) as usize;
|
||||
|
||||
dstbuf.clear();
|
||||
|
||||
@@ -147,27 +130,10 @@ where
|
||||
{
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
let offset = self.offset;
|
||||
|
||||
if srcbuf.len() < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
let len_buf = srcbuf.len() as u8;
|
||||
self.inner.write_all(&[len_buf])?;
|
||||
self.offset += 1;
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if srcbuf.len() > 0x7fff_ffff {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({} bytes)", srcbuf.len()),
|
||||
));
|
||||
}
|
||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
self.inner.write_all(&len_buf)?;
|
||||
self.offset += 4;
|
||||
}
|
||||
self.inner
|
||||
.write_all(&((srcbuf.len()) as u32).to_ne_bytes())?;
|
||||
self.inner.write_all(srcbuf)?;
|
||||
self.offset += srcbuf.len() as u64;
|
||||
self.offset += 4 + srcbuf.len() as u64;
|
||||
Ok(offset)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,6 +198,7 @@ impl BlockWriter for BlockBuf {
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
let blknum = self.blocks.len();
|
||||
self.blocks.push(buf);
|
||||
tracing::info!("buffered block {}", blknum);
|
||||
Ok(blknum as u32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,10 +35,11 @@ use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
@@ -50,11 +51,8 @@ use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -224,7 +222,6 @@ impl Layer for DeltaLayer {
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.key_range.contains(&key));
|
||||
@@ -258,18 +255,8 @@ impl Layer for DeltaLayer {
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
let buf = cursor.read_blob(pos).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let buf = cursor.read_blob(pos)?;
|
||||
let val = Value::des(&buf)?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
@@ -300,10 +287,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
|
||||
let inner = match self.load() {
|
||||
Ok(inner) => inner,
|
||||
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
|
||||
};
|
||||
let inner = self.load().unwrap();
|
||||
|
||||
match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
@@ -435,9 +419,7 @@ impl DeltaLayer {
|
||||
drop(inner);
|
||||
let inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(inner).with_context(|| {
|
||||
format!("Failed to load delta layer {}", self.path().display())
|
||||
})?;
|
||||
self.load_inner(inner)?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
//! - page-oriented
|
||||
//!
|
||||
//! TODO:
|
||||
//! - better errors (e.g. with thiserror?)
|
||||
//! - maybe something like an Adaptive Radix Tree would be more efficient?
|
||||
//! - the values stored by image and delta layers are offsets into the file,
|
||||
//! and they are in monotonically increasing order. Prefix compression would
|
||||
@@ -18,12 +19,11 @@
|
||||
//! - An Iterator interface would be more convenient for the callers than the
|
||||
//! 'visit' function
|
||||
//!
|
||||
use anyhow;
|
||||
use byteorder::{ReadBytesExt, BE};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use hex;
|
||||
use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::layered_repository::block_io::{BlockReader, BlockWriter};
|
||||
|
||||
@@ -86,23 +86,6 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DiskBtreeError {
|
||||
#[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)]
|
||||
AppendOverflow(u64),
|
||||
|
||||
#[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")]
|
||||
UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> },
|
||||
|
||||
#[error("Could not push to new leaf node")]
|
||||
FailedToPushToNewLeafNode,
|
||||
|
||||
#[error("IoError: {0}")]
|
||||
Io(#[from] io::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = result::Result<T, DiskBtreeError>;
|
||||
|
||||
/// This is the on-disk representation.
|
||||
struct OnDiskNode<'a, const L: usize> {
|
||||
// Fixed-width fields
|
||||
@@ -123,12 +106,12 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
///
|
||||
/// Interpret a PAGE_SZ page as a node.
|
||||
///
|
||||
fn deparse(buf: &[u8]) -> Result<OnDiskNode<L>> {
|
||||
fn deparse(buf: &[u8]) -> OnDiskNode<L> {
|
||||
let mut cursor = std::io::Cursor::new(buf);
|
||||
let num_children = cursor.read_u16::<BE>()?;
|
||||
let level = cursor.read_u8()?;
|
||||
let prefix_len = cursor.read_u8()?;
|
||||
let suffix_len = cursor.read_u8()?;
|
||||
let num_children = cursor.read_u16::<BE>().unwrap();
|
||||
let level = cursor.read_u8().unwrap();
|
||||
let prefix_len = cursor.read_u8().unwrap();
|
||||
let suffix_len = cursor.read_u8().unwrap();
|
||||
|
||||
let mut off = cursor.position();
|
||||
let prefix_off = off as usize;
|
||||
@@ -146,7 +129,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
let keys = &buf[keys_off..keys_off + keys_len];
|
||||
let values = &buf[values_off..values_off + values_len];
|
||||
|
||||
Ok(OnDiskNode {
|
||||
OnDiskNode {
|
||||
num_children,
|
||||
level,
|
||||
prefix_len,
|
||||
@@ -154,7 +137,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
prefix,
|
||||
keys,
|
||||
values,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -166,11 +149,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
Value::from_slice(value_slice)
|
||||
}
|
||||
|
||||
fn binary_search(
|
||||
&self,
|
||||
search_key: &[u8; L],
|
||||
keybuf: &mut [u8],
|
||||
) -> result::Result<usize, usize> {
|
||||
fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result<usize, usize> {
|
||||
let mut size = self.num_children as usize;
|
||||
let mut low = 0;
|
||||
let mut high = size;
|
||||
@@ -230,7 +209,7 @@ where
|
||||
///
|
||||
/// Read the value for given key. Returns the value, or None if it doesn't exist.
|
||||
///
|
||||
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result<Option<u64>> {
|
||||
let mut result: Option<u64> = None;
|
||||
self.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
if key == search_key {
|
||||
@@ -251,7 +230,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
mut visitor: V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
@@ -264,7 +243,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
@@ -281,11 +260,11 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
let node = OnDiskNode::deparse(node_buf)?;
|
||||
let node = OnDiskNode::deparse(node_buf);
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -390,15 +369,15 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
pub fn dump(&self) -> anyhow::Result<()> {
|
||||
self.dump_recurse(self.root_blk, &[], 0)
|
||||
}
|
||||
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
let node = OnDiskNode::<L>::deparse(buf);
|
||||
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
println!(
|
||||
@@ -463,24 +442,17 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> {
|
||||
if value > MAX_VALUE {
|
||||
return Err(DiskBtreeError::AppendOverflow(value));
|
||||
}
|
||||
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> {
|
||||
assert!(value <= MAX_VALUE);
|
||||
if let Some(last_key) = &self.last_key {
|
||||
if key <= last_key {
|
||||
return Err(DiskBtreeError::UnsortedInput {
|
||||
key: key.as_slice().into(),
|
||||
last_key: last_key.as_slice().into(),
|
||||
});
|
||||
}
|
||||
assert!(key > last_key, "unsorted input");
|
||||
}
|
||||
self.last_key = Some(*key);
|
||||
|
||||
self.append_internal(key, Value::from_u64(value))
|
||||
Ok(self.append_internal(key, Value::from_u64(value))?)
|
||||
}
|
||||
|
||||
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> {
|
||||
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> {
|
||||
// Try to append to the current leaf buffer
|
||||
let last = self.stack.last_mut().unwrap();
|
||||
let level = last.level;
|
||||
@@ -504,15 +476,14 @@ where
|
||||
// key to it.
|
||||
let mut last = BuildNode::new(level);
|
||||
if !last.push(key, value) {
|
||||
return Err(DiskBtreeError::FailedToPushToNewLeafNode);
|
||||
panic!("could not push to new leaf node");
|
||||
}
|
||||
|
||||
self.stack.push(last);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_node(&mut self) -> Result<()> {
|
||||
fn flush_node(&mut self) -> Result<(), std::io::Error> {
|
||||
let last = self.stack.pop().unwrap();
|
||||
let buf = last.pack();
|
||||
let downlink_key = last.first_key();
|
||||
@@ -534,7 +505,7 @@ where
|
||||
/// (In the image and delta layers, it is stored in the beginning of the file,
|
||||
/// in the summary header)
|
||||
///
|
||||
pub fn finish(mut self) -> Result<(u32, W)> {
|
||||
pub fn finish(mut self) -> Result<(u32, W), std::io::Error> {
|
||||
// flush all levels, except the root.
|
||||
while self.stack.len() > 1 {
|
||||
self.flush_node()?;
|
||||
@@ -721,14 +692,14 @@ mod tests {
|
||||
impl BlockReader for TestDisk {
|
||||
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf))
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
|
||||
fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
|
||||
let blknum = self.blocks.len();
|
||||
self.blocks.push(buf);
|
||||
Ok(blknum as u32)
|
||||
@@ -736,7 +707,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() -> Result<()> {
|
||||
fn basic() -> anyhow::Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
@@ -817,7 +788,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lots_of_keys() -> Result<()> {
|
||||
fn lots_of_keys() -> anyhow::Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
|
||||
@@ -911,7 +882,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_data() -> Result<()> {
|
||||
fn random_data() -> anyhow::Result<()> {
|
||||
// Generate random keys with exponential distribution, to
|
||||
// exercise the prefix compression
|
||||
const NUM_KEYS: usize = 100000;
|
||||
@@ -956,27 +927,21 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "unsorted input")]
|
||||
fn unsorted_input() {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk);
|
||||
|
||||
let _ = writer.append(b"ba", 1);
|
||||
let _ = writer.append(b"bb", 2);
|
||||
let err = writer.append(b"aa", 3).expect_err("should've failed");
|
||||
match err {
|
||||
DiskBtreeError::UnsortedInput { key, last_key } => {
|
||||
assert_eq!(key.as_ref(), b"aa".as_slice());
|
||||
assert_eq!(last_key.as_ref(), b"bb".as_slice());
|
||||
}
|
||||
_ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"),
|
||||
}
|
||||
let _ = writer.append(b"aa", 3);
|
||||
}
|
||||
|
||||
///
|
||||
/// This test contains a particular data set, see disk_btree_test_data.rs
|
||||
///
|
||||
#[test]
|
||||
fn particular_data() -> Result<()> {
|
||||
fn particular_data() -> anyhow::Result<()> {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
|
||||
@@ -16,8 +16,8 @@ use std::io::{Error, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tracing::*;
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
@@ -199,24 +199,18 @@ impl BlobWriter for EphemeralFile {
|
||||
let mut buf = self.get_buf_for_write(blknum)?;
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
buf[off] = srcbuf.len() as u8;
|
||||
off += 1;
|
||||
let len_buf = u32::to_ne_bytes(srcbuf.len() as u32);
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
@@ -235,13 +229,7 @@ impl BlobWriter for EphemeralFile {
|
||||
buf_remain = &buf_remain[this_blk_len..];
|
||||
}
|
||||
drop(buf);
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.size += 1;
|
||||
} else {
|
||||
self.size += 4;
|
||||
}
|
||||
self.size += srcbuf.len() as u64;
|
||||
self.size += 4 + srcbuf.len() as u64;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
@@ -256,31 +244,16 @@ impl Drop for EphemeralFile {
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
if let Err(e) = res {
|
||||
warn!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
// unlink file
|
||||
// FIXME: print error
|
||||
let _ = std::fs::remove_file(&self.file.path);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
file.path.display(),
|
||||
e
|
||||
),
|
||||
)),
|
||||
}
|
||||
file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
@@ -399,12 +372,6 @@ mod tests {
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with a large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
|
||||
@@ -30,10 +30,12 @@ use crate::layered_repository::storage_layer::{
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
@@ -41,13 +43,9 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -150,7 +148,6 @@ impl Layer for ImageLayer {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
@@ -254,9 +251,7 @@ impl ImageLayer {
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
self.load_inner(&mut inner)?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
|
||||
@@ -14,21 +14,19 @@ use crate::layered_repository::storage_layer::{
|
||||
};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use log::*;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
vec_map::VecMap,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -115,7 +113,7 @@ impl Layer for InMemoryLayer {
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
ensure!(lsn_range.start <= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
@@ -126,6 +124,13 @@ impl Layer for InMemoryLayer {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
|
||||
@@ -16,12 +16,12 @@ use crate::layered_repository::InMemoryLayer;
|
||||
use crate::repository::Key;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_gauge, IntGauge};
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
@@ -43,13 +43,10 @@ pub struct LayerMap {
|
||||
pub next_open_layer_at: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// Frozen layers, if any. Frozen layers are in-memory layers that
|
||||
/// are no longer added to, but haven't been written out to disk
|
||||
/// yet. They contain WAL older than the current 'open_layer' or
|
||||
/// 'next_open_layer_at', but newer than any historic layer.
|
||||
/// The frozen layers are in order from oldest to newest, so that
|
||||
/// the newest one is in the 'back' of the VecDeque, and the oldest
|
||||
/// in the 'front'.
|
||||
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
|
||||
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
|
||||
/// layer is during checkpointing, when an InMemoryLayer is being written out
|
||||
/// to disk.
|
||||
///
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::path::PathBuf;
|
||||
|
||||
use anyhow::ensure;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
|
||||
@@ -4,15 +4,13 @@
|
||||
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
|
||||
@@ -7,11 +7,9 @@ pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod reltag;
|
||||
pub mod remote_storage;
|
||||
pub mod repository;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
pub mod thread_mgr;
|
||||
@@ -24,10 +22,13 @@ pub mod walredo;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use tracing::info;
|
||||
use utils::postgres_backend;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::{
|
||||
postgres_backend,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
|
||||
use layered_repository::LayeredRepository;
|
||||
use pgdatadir_mapping::DatadirTimeline;
|
||||
@@ -67,7 +68,7 @@ pub type RepositoryImpl = LayeredRepository;
|
||||
|
||||
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
|
||||
|
||||
pub fn shutdown_pageserver(exit_code: i32) {
|
||||
pub fn shutdown_pageserver() {
|
||||
// Shut down the libpq endpoint thread. This prevents new connections from
|
||||
// being accepted.
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
|
||||
@@ -94,5 +95,5 @@ pub fn shutdown_pageserver(exit_code: i32) {
|
||||
thread_mgr::shutdown_threads(None, None, None);
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ use std::{
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
@@ -19,20 +19,20 @@ use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
auth::{self, Claims, JwtAuth, Scope},
|
||||
lsn::Lsn,
|
||||
postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC},
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::auth::{self, JwtAuth};
|
||||
use zenith_utils::auth::{Claims, Scope};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::is_socket_read_timed_out;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::postgres_backend::{self, AuthType};
|
||||
use zenith_utils::pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
@@ -41,8 +41,6 @@ use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::walreceiver;
|
||||
use crate::CheckpointConfig;
|
||||
use metrics::{register_histogram_vec, HistogramVec};
|
||||
use postgres_ffi::xlog_utils::to_pg_timestamp;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamFeMessage {
|
||||
@@ -327,17 +325,14 @@ impl PageServerHandler {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
while !thread_mgr::is_shutdown_requested() {
|
||||
let msg = pgb.read_message();
|
||||
|
||||
let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
|
||||
match msg {
|
||||
match pgb.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(message) = message {
|
||||
trace!("query: {:?}", message);
|
||||
@@ -389,7 +384,6 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(profiling_guard);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -523,7 +517,7 @@ impl PageServerHandler {
|
||||
info!("starting");
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
@@ -657,7 +651,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;
|
||||
@@ -668,10 +662,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("failpoints ") {
|
||||
ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support");
|
||||
|
||||
let (_, failpoints) = query_string.split_at("failpoints ".len());
|
||||
|
||||
for failpoint in failpoints.split(';') {
|
||||
if let Some((name, actions)) = failpoint.split_once('=') {
|
||||
info!("cfg failpoint: {} {}", name, actions);
|
||||
@@ -681,39 +672,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
}
|
||||
}
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("show ") {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
ensure!(params.len() == 1, "invalid param number for config command");
|
||||
let tenantid = ZTenantId::from_str(params[0])?;
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"compaction_target_size"),
|
||||
RowDescriptor::int8_col(b"compaction_period"),
|
||||
RowDescriptor::int8_col(b"compaction_threshold"),
|
||||
RowDescriptor::int8_col(b"gc_horizon"),
|
||||
RowDescriptor::int8_col(b"gc_period"),
|
||||
RowDescriptor::int8_col(b"image_creation_threshold"),
|
||||
RowDescriptor::int8_col(b"pitr_interval"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(repo.get_checkpoint_distance().to_string().as_bytes()),
|
||||
Some(repo.get_compaction_target_size().to_string().as_bytes()),
|
||||
Some(
|
||||
repo.get_compaction_period()
|
||||
.as_secs()
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(repo.get_compaction_threshold().to_string().as_bytes()),
|
||||
Some(repo.get_gc_horizon().to_string().as_bytes()),
|
||||
Some(repo.get_gc_period().as_secs().to_string().as_bytes()),
|
||||
Some(repo.get_image_creation_threshold().to_string().as_bytes()),
|
||||
Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("do_gc ") {
|
||||
// Run GC immediately on given timeline.
|
||||
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
|
||||
@@ -731,20 +689,16 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let gc_horizon: u64 = caps
|
||||
.get(4)
|
||||
.map(|h| h.as_str().parse())
|
||||
.unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, Duration::ZERO, true)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layers_total"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_pitr"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layers_not_updated"),
|
||||
RowDescriptor::int8_col(b"layers_removed"),
|
||||
@@ -753,33 +707,12 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.layers_total.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_pitr.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_branches.to_string().as_bytes()),
|
||||
Some(result.layers_not_updated.to_string().as_bytes()),
|
||||
Some(result.layers_removed.to_string().as_bytes()),
|
||||
Some(result.elapsed.as_millis().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("compact ") {
|
||||
// Run compaction immediately on given timeline.
|
||||
// FIXME This is just for tests. Don't expect this to be exposed to
|
||||
// the users or the api.
|
||||
|
||||
// compact <tenant_id> <timeline_id>
|
||||
let re = Regex::new(r"^compact ([[:xdigit:]]+)\s([[:xdigit:]]+)($|\s)?").unwrap();
|
||||
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("Invalid compact: '{}'", query_string))?;
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
.context("Couldn't load timeline")?;
|
||||
timeline.tline.compact()?;
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("checkpoint ") {
|
||||
// Run checkpoint immediately on given timeline.
|
||||
|
||||
@@ -793,7 +726,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
@@ -806,33 +739,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("get_lsn_by_timestamp ") {
|
||||
// Locate LSN of last transaction with timestamp less or equal than sppecified
|
||||
// TODO lazy static
|
||||
let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$")
|
||||
.unwrap();
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?;
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?;
|
||||
let timestamp_pg = to_pg_timestamp(timestamp);
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"lsn",
|
||||
)]))?;
|
||||
let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? {
|
||||
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
};
|
||||
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
bail!("unknown command");
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@ use crate::repository::{Repository, Timeline};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::xlog_utils::TimestampTz;
|
||||
use postgres_ffi::{pg_constants, Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
@@ -21,7 +20,8 @@ use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicIsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, RwLockReadGuard};
|
||||
use tracing::{debug, error, trace, warn};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
|
||||
pub type BlockNumber = u32;
|
||||
@@ -46,13 +46,6 @@ where
|
||||
current_logical_size: AtomicIsize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
Present(Lsn),
|
||||
Future(Lsn),
|
||||
Past(Lsn),
|
||||
}
|
||||
|
||||
impl<R: Repository> DatadirTimeline<R> {
|
||||
pub fn new(tline: Arc<R::Timeline>, repartition_threshold: u64) -> Self {
|
||||
DatadirTimeline {
|
||||
@@ -210,106 +203,6 @@ impl<R: Repository> DatadirTimeline<R> {
|
||||
Ok(exists)
|
||||
}
|
||||
|
||||
/// Locate LSN, such that all transactions that committed before
|
||||
/// 'search_timestamp' are visible, but nothing newer is.
|
||||
///
|
||||
/// This is not exact. Commit timestamps are not guaranteed to be ordered,
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
|
||||
let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
let max_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
// LSNs are always 8-byte aligned. low/mid/high represent the
|
||||
// LSN divided by 8.
|
||||
let mut low = min_lsn.0 / 8;
|
||||
let mut high = max_lsn.0 / 8 + 1;
|
||||
|
||||
let mut found_smaller = false;
|
||||
let mut found_larger = false;
|
||||
while low < high {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
|
||||
let cmp = self.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
)?;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
match (found_smaller, found_larger) {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
bail!("no commit timestamps found");
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
// Back off by one to get to the point just before the commit.
|
||||
//
|
||||
// FIXME: it would be better to get the LSN of the previous commit.
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
|
||||
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
|
||||
///
|
||||
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
) -> Result<bool> {
|
||||
for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
|
||||
let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page =
|
||||
self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
|
||||
|
||||
if clog_page.len() == pg_constants::BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return Ok(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
|
||||
// fetch directory entry
|
||||
@@ -1319,7 +1212,7 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
|
||||
#[cfg(test)]
|
||||
pub fn create_test_timeline<R: Repository>(
|
||||
repo: R,
|
||||
timeline_id: utils::zid::ZTimelineId,
|
||||
timeline_id: zenith_utils::zid::ZTimelineId,
|
||||
) -> Result<Arc<crate::DatadirTimeline<R>>> {
|
||||
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let tline = DatadirTimeline::new(tline, 256 * 1024);
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
//!
|
||||
//! Support for profiling
|
||||
//!
|
||||
//! This relies on a modified version of the 'pprof-rs' crate. That's not very
|
||||
//! nice, so to avoid a hard dependency on that, this is an optional feature.
|
||||
//!
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
|
||||
/// The actual implementation is in the `profiling_impl` submodule. If the profiling
|
||||
/// feature is not enabled, it's just a dummy implementation that panics if you
|
||||
/// try to enabled profiling in the configuration.
|
||||
pub use profiling_impl::*;
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
mod profiling_impl {
|
||||
use super::*;
|
||||
use pprof;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Start profiling the current thread. Returns a guard object;
|
||||
/// the profiling continues until the guard is dropped.
|
||||
///
|
||||
/// Note: profiling is not re-entrant. If you call 'profpoint_start' while
|
||||
/// profiling is already started, nothing happens, and the profiling will be
|
||||
/// stopped when either guard object is dropped.
|
||||
#[inline]
|
||||
pub fn profpoint_start(
|
||||
conf: &crate::config::PageServerConf,
|
||||
point: ProfilingConfig,
|
||||
) -> Option<ProfilingGuard> {
|
||||
if conf.profiling == point {
|
||||
pprof::start_profiling();
|
||||
Some(ProfilingGuard(PhantomData))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// A hack to remove Send and Sync from the ProfilingGuard. Because the
|
||||
/// profiling is attached to current thread.
|
||||
////
|
||||
/// See comments in https://github.com/rust-lang/rust/issues/68318
|
||||
type PhantomUnsend = std::marker::PhantomData<*mut u8>;
|
||||
|
||||
pub struct ProfilingGuard(PhantomUnsend);
|
||||
|
||||
impl Drop for ProfilingGuard {
|
||||
fn drop(&mut self) {
|
||||
pprof::stop_profiling();
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize the profiler. This must be called before any 'profpoint_start' calls.
|
||||
pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
|
||||
if conf.profiling != ProfilingConfig::Disabled {
|
||||
Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Exit the profiler. Writes the flamegraph to current workdir.
|
||||
pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
|
||||
// Write out the flamegraph
|
||||
if let Some(profiler_guard) = profiler_guard {
|
||||
if let Ok(report) = profiler_guard.report().build() {
|
||||
// this gets written under the workdir
|
||||
let file = std::fs::File::create("flamegraph.svg").unwrap();
|
||||
let mut options = pprof::flamegraph::Options::default();
|
||||
options.image_width = Some(2500);
|
||||
report.flamegraph_with_options(file, &mut options).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
|
||||
#[cfg(not(feature = "profiling"))]
|
||||
mod profiling_impl {
|
||||
use super::*;
|
||||
|
||||
pub struct DummyProfilerGuard;
|
||||
|
||||
pub fn profpoint_start(
|
||||
_conf: &PageServerConf,
|
||||
_point: ProfilingConfig,
|
||||
) -> Option<DummyProfilerGuard> {
|
||||
None
|
||||
}
|
||||
|
||||
pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
|
||||
if conf.profiling != ProfilingConfig::Disabled {
|
||||
// shouldn't happen, we don't allow profiling in the config if the support
|
||||
// for it is disabled.
|
||||
panic!("profiling enabled but the binary was compiled without profiling support");
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
|
||||
}
|
||||
@@ -9,6 +9,7 @@
|
||||
//!
|
||||
//! * synchronization logic at [`storage_sync`] module that keeps pageserver state (both runtime one and the workdir files) and storage state in sync.
|
||||
//! Synchronization internals are split into submodules
|
||||
//! * [`storage_sync::compression`] for a custom remote storage format used to store timeline files in archives
|
||||
//! * [`storage_sync::index`] to keep track of remote tenant files, the metadata and their mappings to local files
|
||||
//! * [`storage_sync::upload`] and [`storage_sync::download`] to manage archive creation and upload; download and extraction, respectively
|
||||
//!
|
||||
@@ -53,32 +54,25 @@
|
||||
//! The checkpoint uploads are disabled, if no remote storage configuration is provided (no sync loop is started this way either).
|
||||
//! See [`crate::layered_repository`] for the upload calls and the adjacent logic.
|
||||
//!
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`crate::repository::TimelineSyncStatusUpdate`],
|
||||
//! submitted via [`crate::tenant_mgr::apply_timeline_sync_status_updates`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Synchronization logic is able to communicate back with updated timeline sync states, [`TimelineSyncState`],
|
||||
//! submitted via [`crate::tenant_mgr::set_timeline_states`] function. Tenant manager applies corresponding timeline updates in pageserver's in-memory state.
|
||||
//! Such submissions happen in two cases:
|
||||
//! * once after the sync loop startup, to signal pageserver which timelines will be synchronized in the near future
|
||||
//! * after every loop step, in case a timeline needs to be reloaded or evicted from pageserver's memory
|
||||
//!
|
||||
//! When the pageserver terminates, the sync loop finishes a current sync task (if any) and exits.
|
||||
//! When the pageserver terminates, the upload loop finishes a current sync task (if any) and exits.
|
||||
//!
|
||||
//! The storage logic considers `image` as a set of local files (layers), fully representing a certain timeline at given moment (identified with `disk_consistent_lsn` from the corresponding `metadata` file).
|
||||
//! The storage logic considers `image` as a set of local files, fully representing a certain timeline at given moment (identified with `disk_consistent_lsn`).
|
||||
//! Timeline can change its state, by adding more files on disk and advancing its `disk_consistent_lsn`: this happens after pageserver checkpointing and is followed
|
||||
//! by the storage upload, if enabled.
|
||||
//! Yet timeline cannot alter already existing files, and cannot remove those too: only a GC process is capable of removing unused files.
|
||||
//! Yet timeline cannot alter already existing files, and normally cannot remote those too: only a GC process is capable of removing unused files.
|
||||
//! This way, remote storage synchronization relies on the fact that every checkpoint is incremental and local files are "immutable":
|
||||
//! * when a certain checkpoint gets uploaded, the sync loop remembers the fact, preventing further reuploads of the same state
|
||||
//! * no files are deleted from either local or remote storage, only the missing ones locally/remotely get downloaded/uploaded, local metadata file will be overwritten
|
||||
//! when the newer image is downloaded
|
||||
//!
|
||||
//! Pageserver maintains similar to the local file structure remotely: all layer files are uploaded with the same names under the same directory structure.
|
||||
//! Yet instead of keeping the `metadata` file remotely, we wrap it with more data in [`IndexShard`], containing the list of remote files.
|
||||
//! This file gets read to populate the cache, if the remote timeline data is missing from it and gets updated after every successful download.
|
||||
//! This way, we optimize S3 storage access by not running the `S3 list` command that could be expencive and slow: knowing both [`ZTenantId`] and [`ZTimelineId`],
|
||||
//! we can always reconstruct the path to the timeline, use this to get the same path on the remote storage and retrive its shard contents, if needed, same as any layer files.
|
||||
//!
|
||||
//! By default, pageserver reads the remote storage index data only for timelines located locally, to synchronize those, if needed.
|
||||
//! Bulk index data download happens only initially, on pageserer startup. The rest of the remote storage stays unknown to pageserver and loaded on demand only,
|
||||
//! when a new timeline is scheduled for the download.
|
||||
//! To optimize S3 storage (and access), the sync loop compresses the checkpoint files before placing them to S3, and uncompresses them back, keeping track of timeline files and metadata.
|
||||
//! Also, the remote file list is queried once only, at startup, to avoid possible extra costs and latency issues.
|
||||
//!
|
||||
//! NOTES:
|
||||
//! * pageserver assumes it has exclusive write access to the remote storage. If supported, the way multiple pageservers can be separated in the same storage
|
||||
@@ -92,7 +86,7 @@ mod s3_bucket;
|
||||
mod storage_sync;
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
collections::HashMap,
|
||||
ffi, fs,
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
@@ -100,37 +94,22 @@ use std::{
|
||||
use anyhow::{bail, Context};
|
||||
use tokio::io;
|
||||
use tracing::{debug, error, info};
|
||||
use zenith_utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use self::storage_sync::TEMP_DOWNLOAD_EXTENSION;
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::S3Bucket,
|
||||
storage_sync::{
|
||||
download_index_part,
|
||||
index::{IndexPart, RemoteIndex, RemoteTimeline},
|
||||
schedule_timeline_checkpoint_upload, schedule_timeline_download,
|
||||
},
|
||||
};
|
||||
pub use self::storage_sync::index::{RemoteIndex, TimelineIndexEntry};
|
||||
pub use self::storage_sync::{schedule_timeline_checkpoint_upload, schedule_timeline_download};
|
||||
use self::{local_fs::LocalFs, s3_bucket::S3Bucket};
|
||||
use crate::layered_repository::ephemeral_file::is_ephemeral_file;
|
||||
use crate::{
|
||||
config::{PageServerConf, RemoteStorageKind},
|
||||
layered_repository::{
|
||||
ephemeral_file::is_ephemeral_file,
|
||||
metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
},
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
};
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
/// A timeline status to share with pageserver's sync counterpart,
|
||||
/// after comparing local and remote timeline state.
|
||||
pub use storage_sync::compression;
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum LocalTimelineInitStatus {
|
||||
/// The timeline has every remote layer present locally.
|
||||
/// There could be some layers requiring uploading,
|
||||
/// but this does not block the timeline from any user interaction.
|
||||
LocallyComplete,
|
||||
/// A timeline has some files remotely, that are not present locally and need downloading.
|
||||
/// Downloading might update timeline's metadata locally and current pageserver logic deals with local layers only,
|
||||
/// so the data needs to be downloaded first before the timeline can be used.
|
||||
NeedsSync,
|
||||
}
|
||||
|
||||
@@ -162,7 +141,7 @@ pub fn start_local_timeline_sync(
|
||||
config,
|
||||
local_timeline_files,
|
||||
LocalFs::new(root.clone(), &config.workdir)?,
|
||||
storage_config.max_concurrent_timelines_sync,
|
||||
storage_config.max_concurrent_sync,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
},
|
||||
@@ -173,7 +152,7 @@ pub fn start_local_timeline_sync(
|
||||
config,
|
||||
local_timeline_files,
|
||||
S3Bucket::new(s3_config, &config.workdir)?,
|
||||
storage_config.max_concurrent_timelines_sync,
|
||||
storage_config.max_concurrent_sync,
|
||||
storage_config.max_sync_errors,
|
||||
)
|
||||
},
|
||||
@@ -200,7 +179,7 @@ pub fn start_local_timeline_sync(
|
||||
|
||||
fn local_tenant_timeline_files(
|
||||
config: &'static PageServerConf,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut local_tenant_timeline_files = HashMap::new();
|
||||
let tenants_dir = config.tenants_path();
|
||||
for tenants_dir_entry in fs::read_dir(&tenants_dir)
|
||||
@@ -235,8 +214,9 @@ fn local_tenant_timeline_files(
|
||||
fn collect_timelines_for_tenant(
|
||||
config: &'static PageServerConf,
|
||||
tenant_path: &Path,
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, HashSet<PathBuf>)>> {
|
||||
let mut timelines = HashMap::new();
|
||||
) -> anyhow::Result<HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)>> {
|
||||
let mut timelines: HashMap<ZTenantTimelineId, (TimelineMetadata, Vec<PathBuf>)> =
|
||||
HashMap::new();
|
||||
let tenant_id = tenant_path
|
||||
.file_name()
|
||||
.and_then(ffi::OsStr::to_str)
|
||||
@@ -285,8 +265,8 @@ fn collect_timelines_for_tenant(
|
||||
// NOTE: ephemeral files are excluded from the list
|
||||
fn collect_timeline_files(
|
||||
timeline_dir: &Path,
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, HashSet<PathBuf>)> {
|
||||
let mut timeline_files = HashSet::new();
|
||||
) -> anyhow::Result<(ZTimelineId, TimelineMetadata, Vec<PathBuf>)> {
|
||||
let mut timeline_files = Vec::new();
|
||||
let mut timeline_metadata_path = None;
|
||||
|
||||
let timeline_id = timeline_dir
|
||||
@@ -305,29 +285,12 @@ fn collect_timeline_files(
|
||||
} else if is_ephemeral_file(&entry_path.file_name().unwrap().to_string_lossy()) {
|
||||
debug!("skipping ephemeral file {}", entry_path.display());
|
||||
continue;
|
||||
} else if entry_path.extension().and_then(ffi::OsStr::to_str)
|
||||
== Some(TEMP_DOWNLOAD_EXTENSION)
|
||||
{
|
||||
info!("removing temp download file at {}", entry_path.display());
|
||||
fs::remove_file(&entry_path).with_context(|| {
|
||||
format!(
|
||||
"failed to remove temp download file at {}",
|
||||
entry_path.display()
|
||||
)
|
||||
})?;
|
||||
} else {
|
||||
timeline_files.insert(entry_path);
|
||||
timeline_files.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME (rodionov) if attach call succeeded, and then pageserver is restarted before download is completed
|
||||
// then attach is lost. There would be no retries for that,
|
||||
// initial collect will fail because there is no metadata.
|
||||
// We either need to start download if we see empty dir after restart or attach caller should
|
||||
// be aware of that and retry attach if awaits_download for timeline switched from true to false
|
||||
// but timelinne didnt appear locally.
|
||||
// Check what happens with remote index in that case.
|
||||
let timeline_metadata_path = match timeline_metadata_path {
|
||||
Some(path) => path,
|
||||
None => bail!("No metadata file found in the timeline directory"),
|
||||
@@ -344,7 +307,7 @@ fn collect_timeline_files(
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type StoragePath;
|
||||
|
||||
@@ -361,9 +324,6 @@ pub trait RemoteStorage: Send + Sync {
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
52
pageserver/src/remote_storage/README.md
Normal file
52
pageserver/src/remote_storage/README.md
Normal file
@@ -0,0 +1,52 @@
|
||||
# Non-implementation details
|
||||
|
||||
This document describes the current state of the backup system in pageserver, existing limitations and concerns, why some things are done the way they are the future development plans.
|
||||
Detailed description on how the synchronization works and how it fits into the rest of the pageserver can be found in the [storage module](./../remote_storage.rs) and its submodules.
|
||||
Ideally, this document should disappear after current implementation concerns are mitigated, with the remaining useful knowledge bits moved into rustdocs.
|
||||
|
||||
## Approach
|
||||
|
||||
Backup functionality is a new component, appeared way after the core DB functionality was implemented.
|
||||
Pageserver layer functionality is also quite volatile at the moment, there's a risk its local file management changes over time.
|
||||
|
||||
To avoid adding more chaos into that, backup functionality is currently designed as a relatively standalone component, with the majority of its logic placed in a standalone async loop.
|
||||
This way, the backups are managed in background, not affecting directly other pageserver parts: this way the backup and restoration process may lag behind, but eventually keep up with the reality. To track that, a set of prometheus metrics is exposed from pageserver.
|
||||
|
||||
## What's done
|
||||
|
||||
Current implementation
|
||||
* provides remote storage wrappers for AWS S3 and local FS
|
||||
* synchronizes the differences with local timelines and remote states as fast as possible
|
||||
* uploads new layer files
|
||||
* downloads and registers timelines, found on the remote storage, but missing locally, if those are requested somehow via pageserver (e.g. http api, gc)
|
||||
* uses compression when deals with files, for better S3 usage
|
||||
* maintains an index of what's stored remotely
|
||||
* evicts failing tasks and stops the corresponding timelines
|
||||
|
||||
The tasks are delayed with every retry and the retries are capped, to avoid poisonous tasks.
|
||||
After any task eviction, or any error at startup checks (e.g. obviously different and wrong local and remote states fot the same timeline),
|
||||
the timeline has to be stopped from submitting further checkpoint upload tasks, which is done along the corresponding timeline status change.
|
||||
|
||||
No good optimisations or performance testing is done, the feature is disabled by default and gets polished over time.
|
||||
It's planned to deal with all questions that are currently on and prepare the feature to be enabled by default in cloud environments.
|
||||
|
||||
### Peculiarities
|
||||
|
||||
As mentioned, the backup component is rather new and under development currently, so not all things are done properly from the start.
|
||||
Here's the list of known compromises with comments:
|
||||
|
||||
* Remote storage file model is currently a custom archive format, that's not possible to deserialize without a particular Rust code of ours (including `serde`).
|
||||
We also don't optimize the archivation and pack every timeline checkpoint separately, so the resulting blob's size that gets on S3 could be arbitrary.
|
||||
But, it's a single blob, which is way better than storing ~780 small files separately.
|
||||
|
||||
* Archive index restoration requires reading every blob's head.
|
||||
This could be avoided by a background thread/future storing the serialized index in the remote storage.
|
||||
|
||||
* no proper file comparison
|
||||
|
||||
No file checksum assertion is done currently, but should be (AWS S3 returns file checksums during the `list` operation)
|
||||
|
||||
* gc is ignored
|
||||
|
||||
So far, we don't adjust the remote storage based on GC thread loop results, only checkpointer loop affects the remote storage.
|
||||
Index module could be used as a base to implement a deferred GC mechanism, a "defragmentation" that repacks archives into new ones after GC is done removing the files from the archives.
|
||||
@@ -17,8 +17,6 @@ use tokio::{
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use crate::remote_storage::storage_sync::path_with_suffix_extension;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
pub struct LocalFs {
|
||||
@@ -106,8 +104,7 @@ impl RemoteStorage for LocalFs {
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
mut from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -116,7 +113,7 @@ impl RemoteStorage for LocalFs {
|
||||
// We need this dance with sort of durable rename (without fsyncs)
|
||||
// to prevent partial uploads. This was really hit when pageserver shutdown
|
||||
// cancelled the upload and partial file was left on the fs
|
||||
let temp_file_path = path_with_suffix_extension(&target_file_path, "temp");
|
||||
let temp_file_path = path_with_suffix_extension(&target_file_path, ".temp");
|
||||
let mut destination = io::BufWriter::new(
|
||||
fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -131,11 +128,7 @@ impl RemoteStorage for LocalFs {
|
||||
})?,
|
||||
);
|
||||
|
||||
let from_size_bytes = from_size_bytes as u64;
|
||||
// Require to read 1 byte more than the expected to check later, that the stream and its size match.
|
||||
let mut buffer_to_read = from.take(from_size_bytes + 1);
|
||||
|
||||
let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
|
||||
io::copy(&mut from, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
@@ -144,19 +137,6 @@ impl RemoteStorage for LocalFs {
|
||||
)
|
||||
})?;
|
||||
|
||||
ensure!(
|
||||
bytes_read == from_size_bytes,
|
||||
"Provided stream has actual size {} fthat is smaller than the given stream size {}",
|
||||
bytes_read,
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
ensure!(
|
||||
buffer_to_read.read(&mut [0]).await? == 0,
|
||||
"Provided stream has bigger size than the given stream size {}",
|
||||
from_size_bytes
|
||||
);
|
||||
|
||||
destination.flush().await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload (flush temp) file to the local storage at '{}'",
|
||||
@@ -301,8 +281,15 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
}
|
||||
|
||||
fn path_with_suffix_extension(original_path: &Path, suffix: &str) -> PathBuf {
|
||||
let mut extension_with_suffix = original_path.extension().unwrap_or_default().to_os_string();
|
||||
extension_with_suffix.push(suffix);
|
||||
|
||||
original_path.with_extension(extension_with_suffix)
|
||||
}
|
||||
|
||||
fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
path_with_suffix_extension(original_path, "metadata")
|
||||
path_with_suffix_extension(original_path, ".metadata")
|
||||
}
|
||||
|
||||
fn get_all_files<'a, P>(
|
||||
@@ -522,13 +509,13 @@ mod fs_tests {
|
||||
let repo_harness = RepoHarness::create("upload_file")?;
|
||||
let storage = create_storage()?;
|
||||
|
||||
let (file, size) = create_file_for_upload(
|
||||
let source = create_file_for_upload(
|
||||
&storage.pageserver_workdir.join("whatever"),
|
||||
"whatever_contents",
|
||||
)
|
||||
.await?;
|
||||
let target_path = PathBuf::from("/").join("somewhere").join("else");
|
||||
match storage.upload(file, size, &target_path, None).await {
|
||||
match storage.upload(source, &target_path, None).await {
|
||||
Ok(()) => panic!("Should not allow storing files with wrong target path"),
|
||||
Err(e) => {
|
||||
let message = format!("{:?}", e);
|
||||
@@ -813,17 +800,24 @@ mod fs_tests {
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let relative_timeline_path = timeline_path.strip_prefix(&harness.conf.workdir)?;
|
||||
let storage_path = storage.root.join(relative_timeline_path).join(name);
|
||||
|
||||
let from_path = storage.pageserver_workdir.join(name);
|
||||
let (file, size) = create_file_for_upload(&from_path, &dummy_contents(name)).await?;
|
||||
storage.upload(file, size, &storage_path, metadata).await?;
|
||||
storage
|
||||
.upload(
|
||||
create_file_for_upload(
|
||||
&storage.pageserver_workdir.join(name),
|
||||
&dummy_contents(name),
|
||||
)
|
||||
.await?,
|
||||
&storage_path,
|
||||
metadata,
|
||||
)
|
||||
.await?;
|
||||
Ok(storage_path)
|
||||
}
|
||||
|
||||
async fn create_file_for_upload(
|
||||
path: &Path,
|
||||
contents: &str,
|
||||
) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
|
||||
) -> anyhow::Result<io::BufReader<fs::File>> {
|
||||
std::fs::create_dir_all(path.parent().unwrap())?;
|
||||
let mut file_for_writing = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
@@ -831,10 +825,8 @@ mod fs_tests {
|
||||
.open(path)?;
|
||||
write!(file_for_writing, "{}", contents)?;
|
||||
drop(file_for_writing);
|
||||
let file_size = path.metadata()?.len() as usize;
|
||||
Ok((
|
||||
io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
|
||||
file_size,
|
||||
Ok(io::BufReader::new(
|
||||
fs::OpenOptions::new().read(true).open(&path).await?,
|
||||
))
|
||||
}
|
||||
|
||||
|
||||
@@ -15,9 +15,9 @@ use rusoto_s3::{
|
||||
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
|
||||
StreamingBody, S3,
|
||||
};
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio::io;
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, trace};
|
||||
|
||||
use crate::{
|
||||
config::S3Config,
|
||||
@@ -65,15 +65,15 @@ pub struct S3Bucket {
|
||||
client: S3Client,
|
||||
bucket_name: String,
|
||||
prefix_in_bucket: Option<String>,
|
||||
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
|
||||
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
|
||||
// The helps to ensure we don't exceed the thresholds.
|
||||
concurrency_limiter: Semaphore,
|
||||
}
|
||||
|
||||
impl S3Bucket {
|
||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||
pub fn new(aws_config: &S3Config, pageserver_workdir: &'static Path) -> anyhow::Result<Self> {
|
||||
// TODO kb check this
|
||||
// Keeping a single client may cause issues due to timeouts.
|
||||
// https://github.com/rusoto/rusoto/issues/1686
|
||||
|
||||
debug!(
|
||||
"Creating s3 remote storage for S3 bucket {}",
|
||||
aws_config.bucket_name
|
||||
@@ -91,10 +91,10 @@ impl S3Bucket {
|
||||
let request_dispatcher = HttpClient::new().context("Failed to create S3 http client")?;
|
||||
let client = if aws_config.access_key_id.is_none() && aws_config.secret_access_key.is_none()
|
||||
{
|
||||
debug!("Using IAM-based AWS access");
|
||||
trace!("Using IAM-based AWS access");
|
||||
S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
|
||||
} else {
|
||||
debug!("Using credentials-based AWS access");
|
||||
trace!("Using credentials-based AWS access");
|
||||
S3Client::new_with(
|
||||
request_dispatcher,
|
||||
StaticProvider::new_minimal(
|
||||
@@ -123,7 +123,6 @@ impl S3Bucket {
|
||||
pageserver_workdir,
|
||||
bucket_name: aws_config.bucket_name.clone(),
|
||||
prefix_in_bucket,
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -152,11 +151,6 @@ impl RemoteStorage for S3Bucket {
|
||||
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
@@ -186,21 +180,12 @@ impl RemoteStorage for S3Bucket {
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
from_size_bytes: usize,
|
||||
to: &Self::StoragePath,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 upload")?;
|
||||
self.client
|
||||
.put_object(PutObjectRequest {
|
||||
body: Some(StreamingBody::new_with_size(
|
||||
ReaderStream::new(from),
|
||||
from_size_bytes,
|
||||
)),
|
||||
body: Some(StreamingBody::new(ReaderStream::new(from))),
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: to.key().to_owned(),
|
||||
metadata: metadata.map(|m| m.0),
|
||||
@@ -215,11 +200,6 @@ impl RemoteStorage for S3Bucket {
|
||||
from: &Self::StoragePath,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")?;
|
||||
let object_output = self
|
||||
.client
|
||||
.get_object(GetObjectRequest {
|
||||
@@ -251,11 +231,6 @@ impl RemoteStorage for S3Bucket {
|
||||
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
||||
None => format!("bytes={}-", start_inclusive),
|
||||
});
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 range download")?;
|
||||
let object_output = self
|
||||
.client
|
||||
.get_object(GetObjectRequest {
|
||||
@@ -275,11 +250,6 @@ impl RemoteStorage for S3Bucket {
|
||||
}
|
||||
|
||||
async fn delete(&self, path: &Self::StoragePath) -> anyhow::Result<()> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 delete")?;
|
||||
self.client
|
||||
.delete_object(DeleteObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
@@ -463,7 +433,6 @@ mod tests {
|
||||
client: S3Client::new("us-east-1".parse().unwrap()),
|
||||
bucket_name: "dummy-bucket".to_string(),
|
||||
prefix_in_bucket: Some("dummy_prefix/".to_string()),
|
||||
concurrency_limiter: Semaphore::new(1),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
612
pageserver/src/remote_storage/storage_sync/compression.rs
Normal file
612
pageserver/src/remote_storage/storage_sync/compression.rs
Normal file
@@ -0,0 +1,612 @@
|
||||
//! A set of structs to represent a compressed part of the timeline, and methods to asynchronously compress and uncompress a stream of data,
|
||||
//! without holding the entire data in memory.
|
||||
//! For the latter, both compress and uncompress functions operate buffered streams (currently hardcoded size of [`ARCHIVE_STREAM_BUFFER_SIZE_BYTES`]),
|
||||
//! not attempting to hold the entire archive in memory.
|
||||
//!
|
||||
//! The compression is done with <a href="https://datatracker.ietf.org/doc/html/rfc8878">zstd</a> streaming algorithm via the `async-compression` crate.
|
||||
//! The crate does not contain any knobs to tweak the compression, but otherwise is one of the only ones that's both async and has an API to manage the part of an archive.
|
||||
//! Zstd was picked as the best algorithm among the ones available in the crate, after testing the initial timeline file compression.
|
||||
//!
|
||||
//! Archiving is almost agnostic to timeline file types, with an exception of the metadata file, that's currently distinguished in the [un]compression code.
|
||||
//! The metadata file is treated separately when [de]compression is involved, to reduce the risk of corrupting the metadata file.
|
||||
//! When compressed, the metadata file is always required and stored as the last file in the archive stream.
|
||||
//! When uncompressed, the metadata file gets naturally uncompressed last, to ensure that all other layer files are decompressed successfully first.
|
||||
//!
|
||||
//! Archive structure:
|
||||
//! +----------------------------------------+
|
||||
//! | header | file_1, ..., file_k, metadata |
|
||||
//! +----------------------------------------+
|
||||
//!
|
||||
//! The archive consists of two separate zstd archives:
|
||||
//! * header archive, that contains all files names and their sizes and relative paths in the timeline directory
|
||||
//! Header is a Rust structure, serialized into bytes and compressed with zstd.
|
||||
//! * files archive, that has metadata file as the last one, all compressed with zstd into a single binary blob
|
||||
//!
|
||||
//! Header offset is stored in the file name, along with the `disk_consistent_lsn` from the metadata file.
|
||||
//! See [`parse_archive_name`] and [`ARCHIVE_EXTENSION`] for the name details, example: `00000000016B9150-.zst_9732`.
|
||||
//! This way, the header could be retrieved without reading an entire archive file.
|
||||
|
||||
use std::{
|
||||
collections::BTreeSet,
|
||||
future::Future,
|
||||
io::Cursor,
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use async_compression::tokio::bufread::{ZstdDecoder, ZstdEncoder};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncReadExt, AsyncWriteExt},
|
||||
};
|
||||
use tracing::*;
|
||||
use zenith_utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
|
||||
use crate::layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME};
|
||||
|
||||
use super::index::RelativePath;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct ArchiveHeader {
|
||||
/// All regular timeline files, excluding the metadata file.
|
||||
pub files: Vec<FileEntry>,
|
||||
// Metadata file name is known to the system, as its location relative to the timeline dir,
|
||||
// so no need to store anything but its size in bytes.
|
||||
pub metadata_file_size: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
|
||||
pub struct FileEntry {
|
||||
/// Uncompressed file size, bytes.
|
||||
pub size: u64,
|
||||
/// A path, relative to the directory root, used when compressing the directory contents.
|
||||
pub subpath: RelativePath,
|
||||
}
|
||||
|
||||
const ARCHIVE_EXTENSION: &str = "-.zst_";
|
||||
const ARCHIVE_STREAM_BUFFER_SIZE_BYTES: usize = 4 * 1024 * 1024;
|
||||
|
||||
/// Streams an archive of files given into a stream target, defined by the closure.
|
||||
///
|
||||
/// The closure approach is picked for cases like S3, where we would need a name of the file before we can get a stream to write the bytes into.
|
||||
/// Current idea is to place the header size in the name of the file, to enable the fast partial remote file index restoration without actually reading remote storage file contents.
|
||||
///
|
||||
/// Performs the compression in multiple steps:
|
||||
/// * prepares an archive header, stripping the `source_dir` prefix from the `files`
|
||||
/// * generates the name of the archive
|
||||
/// * prepares archive producer future, knowing the header and the file list
|
||||
/// An `impl AsyncRead` and `impl AsyncWrite` pair of connected streams is created to implement the partial contents streaming.
|
||||
/// The writer end gets into the archive producer future, to put the header and a stream of compressed files.
|
||||
/// * prepares archive consumer future, by executing the provided closure
|
||||
/// The closure gets the reader end stream and the name of the file to create a future that would stream the file contents elsewhere.
|
||||
/// * runs and waits for both futures to complete
|
||||
/// * on a successful completion of both futures, header, its size and the user-defined consumer future return data is returned
|
||||
/// Due to the design above, the archive name and related data is visible inside the consumer future only, so it's possible to return the data,
|
||||
/// needed for future processing.
|
||||
pub async fn archive_files_as_stream<Cons, ConsRet, Fut>(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata: &TimelineMetadata,
|
||||
create_archive_consumer: Cons,
|
||||
) -> anyhow::Result<(ArchiveHeader, u64, ConsRet)>
|
||||
where
|
||||
Cons: FnOnce(Box<dyn io::AsyncRead + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ConsRet>> + Send + 'static,
|
||||
ConsRet: Send + Sync + 'static,
|
||||
{
|
||||
let metadata_bytes = metadata
|
||||
.to_bytes()
|
||||
.context("Failed to create metadata bytes")?;
|
||||
let (archive_header, compressed_header_bytes) =
|
||||
prepare_header(source_dir, files, &metadata_bytes)
|
||||
.await
|
||||
.context("Failed to prepare file for archivation")?;
|
||||
|
||||
let header_size = compressed_header_bytes.len() as u64;
|
||||
let (write, read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_filler = write_archive_contents(
|
||||
source_dir.to_path_buf(),
|
||||
archive_header.clone(),
|
||||
metadata_bytes,
|
||||
write,
|
||||
);
|
||||
let archive_name = archive_name(metadata.disk_consistent_lsn(), header_size);
|
||||
let archive_stream =
|
||||
Cursor::new(compressed_header_bytes).chain(ZstdEncoder::new(io::BufReader::new(read)));
|
||||
|
||||
let (archive_creation_result, archive_upload_result) = tokio::join!(
|
||||
tokio::spawn(archive_filler),
|
||||
tokio::spawn(async move {
|
||||
create_archive_consumer(Box::new(archive_stream), archive_name).await
|
||||
})
|
||||
);
|
||||
archive_creation_result
|
||||
.context("Failed to spawn archive creation future")?
|
||||
.context("Failed to create an archive")?;
|
||||
let upload_return_value = archive_upload_result
|
||||
.context("Failed to spawn archive upload future")?
|
||||
.context("Failed to upload the archive")?;
|
||||
|
||||
Ok((archive_header, header_size, upload_return_value))
|
||||
}
|
||||
|
||||
/// Similar to [`archive_files_as_stream`], creates a pair of streams to uncompress the 2nd part of the archive,
|
||||
/// that contains files and is located after the header.
|
||||
/// S3 allows downloading partial file contents for a given file key (i.e. name), to accommodate this retrieval,
|
||||
/// a closure is used.
|
||||
/// Same concepts with two concurrent futures, user-defined closure, future and return value apply here, but the
|
||||
/// consumer and the receiver ends are swapped, since the uncompression happens.
|
||||
pub async fn uncompress_file_stream_with_index<Prod, ProdRet, Fut>(
|
||||
destination_dir: PathBuf,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
disk_consistent_lsn: Lsn,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
create_archive_file_part: Prod,
|
||||
) -> anyhow::Result<ProdRet>
|
||||
where
|
||||
Prod: FnOnce(Box<dyn io::AsyncWrite + Unpin + Send + Sync + 'static>, String) -> Fut
|
||||
+ Send
|
||||
+ 'static,
|
||||
Fut: Future<Output = anyhow::Result<ProdRet>> + Send + 'static,
|
||||
ProdRet: Send + Sync + 'static,
|
||||
{
|
||||
let (write, mut read) = io::duplex(ARCHIVE_STREAM_BUFFER_SIZE_BYTES);
|
||||
let archive_name = archive_name(disk_consistent_lsn, header_size);
|
||||
|
||||
let (archive_download_result, archive_uncompress_result) = tokio::join!(
|
||||
tokio::spawn(async move { create_archive_file_part(Box::new(write), archive_name).await }),
|
||||
tokio::spawn(async move {
|
||||
uncompress_with_header(&files_to_skip, &destination_dir, header, &mut read).await
|
||||
})
|
||||
);
|
||||
|
||||
let download_value = archive_download_result
|
||||
.context("Failed to spawn archive download future")?
|
||||
.context("Failed to download an archive")?;
|
||||
archive_uncompress_result
|
||||
.context("Failed to spawn archive uncompress future")?
|
||||
.context("Failed to uncompress the archive")?;
|
||||
|
||||
Ok(download_value)
|
||||
}
|
||||
|
||||
/// Reads archive header from the stream given:
|
||||
/// * parses the file name to get the header size
|
||||
/// * reads the exact amount of bytes
|
||||
/// * uncompresses and deserializes those
|
||||
pub async fn read_archive_header<A: io::AsyncRead + Send + Sync + Unpin>(
|
||||
archive_name: &str,
|
||||
from: &mut A,
|
||||
) -> anyhow::Result<ArchiveHeader> {
|
||||
let (_, header_size) = parse_archive_name(Path::new(archive_name))?;
|
||||
|
||||
let mut compressed_header_bytes = vec![0; header_size as usize];
|
||||
from.read_exact(&mut compressed_header_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to read header header from the archive {}",
|
||||
archive_name
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut header_bytes = Vec::new();
|
||||
ZstdDecoder::new(io::BufReader::new(compressed_header_bytes.as_slice()))
|
||||
.read_to_end(&mut header_bytes)
|
||||
.await
|
||||
.context("Failed to decompress a header from the archive")?;
|
||||
|
||||
ArchiveHeader::des(&header_bytes).context("Failed to deserialize a header from the archive")
|
||||
}
|
||||
|
||||
/// Reads the archive metadata out of the archive name:
|
||||
/// * `disk_consistent_lsn` of the checkpoint that was archived
|
||||
/// * size of the archive header
|
||||
pub fn parse_archive_name(archive_path: &Path) -> anyhow::Result<(Lsn, u64)> {
|
||||
let archive_name = archive_path
|
||||
.file_name()
|
||||
.with_context(|| format!("Archive '{}' has no file name", archive_path.display()))?
|
||||
.to_string_lossy();
|
||||
let (lsn_str, header_size_str) =
|
||||
archive_name
|
||||
.rsplit_once(ARCHIVE_EXTENSION)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has incorrect extension, expected to contain '{}'",
|
||||
archive_path.display(),
|
||||
ARCHIVE_EXTENSION
|
||||
)
|
||||
})?;
|
||||
let disk_consistent_lsn = Lsn::from_hex(lsn_str).with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid disk consistent lsn in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
let header_size = header_size_str.parse::<u64>().with_context(|| {
|
||||
format!(
|
||||
"Archive '{}' has an invalid a header offset number in its extension",
|
||||
archive_path.display(),
|
||||
)
|
||||
})?;
|
||||
Ok((disk_consistent_lsn, header_size))
|
||||
}
|
||||
|
||||
fn archive_name(disk_consistent_lsn: Lsn, header_size: u64) -> String {
|
||||
let archive_name = format!(
|
||||
"{:016X}{ARCHIVE_EXTENSION}{}",
|
||||
u64::from(disk_consistent_lsn),
|
||||
header_size,
|
||||
ARCHIVE_EXTENSION = ARCHIVE_EXTENSION,
|
||||
);
|
||||
archive_name
|
||||
}
|
||||
|
||||
pub async fn uncompress_with_header(
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
destination_dir: &Path,
|
||||
header: ArchiveHeader,
|
||||
archive_after_header: impl io::AsyncRead + Send + Sync + Unpin,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Uncompressing archive into {}", destination_dir.display());
|
||||
let mut archive = ZstdDecoder::new(io::BufReader::new(archive_after_header));
|
||||
|
||||
if !destination_dir.exists() {
|
||||
fs::create_dir_all(&destination_dir)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to create target directory at {}",
|
||||
destination_dir.display()
|
||||
)
|
||||
})?;
|
||||
} else if !destination_dir.is_dir() {
|
||||
bail!(
|
||||
"Destination path '{}' is not a valid directory",
|
||||
destination_dir.display()
|
||||
);
|
||||
}
|
||||
debug!("Will extract {} files from the archive", header.files.len());
|
||||
for entry in header.files {
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&entry.subpath.as_path(destination_dir),
|
||||
entry.size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to uncompress archive entry {:?}", entry))?;
|
||||
}
|
||||
uncompress_entry(
|
||||
&mut archive,
|
||||
&destination_dir.join(METADATA_FILE_NAME),
|
||||
header.metadata_file_size,
|
||||
files_to_skip,
|
||||
)
|
||||
.await
|
||||
.context("Failed to uncompress the metadata entry")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn uncompress_entry(
|
||||
archive: &mut ZstdDecoder<io::BufReader<impl io::AsyncRead + Send + Sync + Unpin>>,
|
||||
destination_path: &Path,
|
||||
entry_size: u64,
|
||||
files_to_skip: &BTreeSet<PathBuf>,
|
||||
) -> anyhow::Result<()> {
|
||||
if let Some(parent) = destination_path.parent() {
|
||||
fs::create_dir_all(parent).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create parent directory for {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
};
|
||||
|
||||
if files_to_skip.contains(destination_path) {
|
||||
debug!("Skipping {}", destination_path.display());
|
||||
copy_n_bytes(entry_size, archive, &mut io::sink())
|
||||
.await
|
||||
.context("Failed to skip bytes in the archive")?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut destination =
|
||||
io::BufWriter::new(fs::File::create(&destination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file {} for extraction",
|
||||
destination_path.display()
|
||||
)
|
||||
})?);
|
||||
copy_n_bytes(entry_size, archive, &mut destination)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to write extracted archive contents into file {}",
|
||||
destination_path.display()
|
||||
)
|
||||
})?;
|
||||
destination
|
||||
.flush()
|
||||
.await
|
||||
.context("Failed to flush the streaming archive bytes")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn write_archive_contents(
|
||||
source_dir: PathBuf,
|
||||
header: ArchiveHeader,
|
||||
metadata_bytes: Vec<u8>,
|
||||
mut archive_input: io::DuplexStream,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Starting writing files into archive");
|
||||
for file_entry in header.files {
|
||||
let path = file_entry.subpath.as_path(&source_dir);
|
||||
let mut source_file =
|
||||
io::BufReader::new(fs::File::open(&path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to open file for archiving to path {}",
|
||||
path.display()
|
||||
)
|
||||
})?);
|
||||
let bytes_written = io::copy(&mut source_file, &mut archive_input)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open add a file into archive, file path {}",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_entry.size == bytes_written,
|
||||
"File {} was written to the archive incompletely",
|
||||
path.display()
|
||||
);
|
||||
trace!(
|
||||
"Added file '{}' ({} bytes) into the archive",
|
||||
path.display(),
|
||||
bytes_written
|
||||
);
|
||||
}
|
||||
let metadata_bytes_written = io::copy(&mut metadata_bytes.as_slice(), &mut archive_input)
|
||||
.await
|
||||
.context("Failed to add metadata into the archive")?;
|
||||
ensure!(
|
||||
header.metadata_file_size == metadata_bytes_written,
|
||||
"Metadata file was written to the archive incompletely",
|
||||
);
|
||||
|
||||
archive_input
|
||||
.shutdown()
|
||||
.await
|
||||
.context("Failed to finalize the archive")?;
|
||||
debug!("Successfully streamed all files into the archive");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn prepare_header(
|
||||
source_dir: &Path,
|
||||
files: impl Iterator<Item = &PathBuf>,
|
||||
metadata_bytes: &[u8],
|
||||
) -> anyhow::Result<(ArchiveHeader, Vec<u8>)> {
|
||||
let mut archive_files = Vec::new();
|
||||
for file_path in files {
|
||||
let file_metadata = fs::metadata(file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to read metadata during archive indexing for {}",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
file_metadata.is_file(),
|
||||
"Archive indexed path {} is not a file",
|
||||
file_path.display()
|
||||
);
|
||||
|
||||
if file_path.file_name().and_then(|name| name.to_str()) != Some(METADATA_FILE_NAME) {
|
||||
let entry = FileEntry {
|
||||
subpath: RelativePath::new(source_dir, file_path).with_context(|| {
|
||||
format!(
|
||||
"File '{}' does not belong to pageserver workspace",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
size: file_metadata.len(),
|
||||
};
|
||||
archive_files.push(entry);
|
||||
}
|
||||
}
|
||||
|
||||
let header = ArchiveHeader {
|
||||
files: archive_files,
|
||||
metadata_file_size: metadata_bytes.len() as u64,
|
||||
};
|
||||
|
||||
debug!("Appending a header for {} files", header.files.len());
|
||||
let header_bytes = header.ser().context("Failed to serialize a header")?;
|
||||
debug!("Header bytes len {}", header_bytes.len());
|
||||
let mut compressed_header_bytes = Vec::new();
|
||||
ZstdEncoder::new(io::BufReader::new(header_bytes.as_slice()))
|
||||
.read_to_end(&mut compressed_header_bytes)
|
||||
.await
|
||||
.context("Failed to compress header bytes")?;
|
||||
debug!(
|
||||
"Compressed header bytes len {}",
|
||||
compressed_header_bytes.len()
|
||||
);
|
||||
Ok((header, compressed_header_bytes))
|
||||
}
|
||||
|
||||
async fn copy_n_bytes(
|
||||
n: u64,
|
||||
from: &mut (impl io::AsyncRead + Send + Sync + Unpin),
|
||||
into: &mut (impl io::AsyncWrite + Send + Sync + Unpin),
|
||||
) -> anyhow::Result<()> {
|
||||
let bytes_written = io::copy(&mut from.take(n), into).await?;
|
||||
ensure!(
|
||||
bytes_written == n,
|
||||
"Failed to read exactly {} bytes from the input, bytes written: {}",
|
||||
n,
|
||||
bytes_written,
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tokio::{fs, io::AsyncSeekExt};
|
||||
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn compress_and_uncompress() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("compress_and_uncompress")?;
|
||||
let timeline_dir = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
init_directory(
|
||||
&timeline_dir,
|
||||
vec![
|
||||
("first", "first_contents"),
|
||||
("second", "second_contents"),
|
||||
(METADATA_FILE_NAME, "wrong_metadata"),
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
let timeline_files = list_file_paths_with_contents(&timeline_dir).await?;
|
||||
assert_eq!(
|
||||
timeline_files,
|
||||
vec![
|
||||
(
|
||||
timeline_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Text("wrong_metadata".to_string())
|
||||
),
|
||||
(
|
||||
timeline_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Initial timeline contents should contain two normal files and a wrong metadata file"
|
||||
);
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0x30), None, None, Lsn(0), Lsn(0), Lsn(0));
|
||||
let paths_to_archive = timeline_files
|
||||
.into_iter()
|
||||
.map(|(path, _)| path)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let tempdir = tempfile::tempdir()?;
|
||||
let base_path = tempdir.path().to_path_buf();
|
||||
let (header, header_size, archive_target) = archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
paths_to_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = base_path.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
let mut file = fs::File::open(&archive_target).await?;
|
||||
file.seek(io::SeekFrom::Start(header_size)).await?;
|
||||
let target_dir = tempdir.path().join("extracted");
|
||||
uncompress_with_header(&BTreeSet::new(), &target_dir, header, file).await?;
|
||||
|
||||
let extracted_files = list_file_paths_with_contents(&target_dir).await?;
|
||||
|
||||
assert_eq!(
|
||||
extracted_files,
|
||||
vec![
|
||||
(
|
||||
target_dir.join("first"),
|
||||
FileContents::Text("first_contents".to_string())
|
||||
),
|
||||
(
|
||||
target_dir.join(METADATA_FILE_NAME),
|
||||
FileContents::Binary(metadata.to_bytes()?)
|
||||
),
|
||||
(
|
||||
target_dir.join("second"),
|
||||
FileContents::Text("second_contents".to_string())
|
||||
),
|
||||
],
|
||||
"Extracted files should contain all local timeline files besides its metadata, which should be taken from the arguments"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn init_directory(
|
||||
root: &Path,
|
||||
files_with_contents: Vec<(&str, &str)>,
|
||||
) -> anyhow::Result<()> {
|
||||
fs::create_dir_all(root).await?;
|
||||
for (file_name, contents) in files_with_contents {
|
||||
fs::File::create(root.join(file_name))
|
||||
.await?
|
||||
.write_all(contents.as_bytes())
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord)]
|
||||
enum FileContents {
|
||||
Text(String),
|
||||
Binary(Vec<u8>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for FileContents {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
Self::Text(text) => f.debug_tuple("Text").field(text).finish(),
|
||||
Self::Binary(bytes) => f
|
||||
.debug_tuple("Binary")
|
||||
.field(&format!("{} bytes", bytes.len()))
|
||||
.finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_file_paths_with_contents(
|
||||
root: &Path,
|
||||
) -> anyhow::Result<Vec<(PathBuf, FileContents)>> {
|
||||
let mut file_paths = Vec::new();
|
||||
|
||||
let mut dir_listings = vec![fs::read_dir(root).await?];
|
||||
while let Some(mut dir_listing) = dir_listings.pop() {
|
||||
while let Some(entry) = dir_listing.next_entry().await? {
|
||||
let entry_path = entry.path();
|
||||
if entry_path.is_file() {
|
||||
let contents = match String::from_utf8(fs::read(&entry_path).await?) {
|
||||
Ok(text) => FileContents::Text(text),
|
||||
Err(e) => FileContents::Binary(e.into_bytes()),
|
||||
};
|
||||
file_paths.push((entry_path, contents));
|
||||
} else if entry_path.is_dir() {
|
||||
dir_listings.push(fs::read_dir(entry_path).await?);
|
||||
} else {
|
||||
info!(
|
||||
"Skipping path '{}' as it's not a file or a directory",
|
||||
entry_path.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
file_paths.sort();
|
||||
Ok(file_paths)
|
||||
}
|
||||
}
|
||||
@@ -1,73 +1,30 @@
|
||||
//! Timeline synchrnonization logic to fetch the layer files from remote storage into pageserver's local directory.
|
||||
//! Timeline synchrnonization logic to put files from archives on remote storage into pageserver's local directory.
|
||||
|
||||
use std::{collections::HashSet, fmt::Debug, path::Path};
|
||||
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncWriteExt},
|
||||
};
|
||||
use tracing::{debug, error, info, warn};
|
||||
use anyhow::{ensure, Context};
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, trace, warn};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::metadata_path,
|
||||
layered_repository::metadata::{metadata_path, TimelineMetadata},
|
||||
remote_storage::{
|
||||
storage_sync::{path_with_suffix_extension, sync_queue, SyncTask},
|
||||
RemoteStorage,
|
||||
storage_sync::{
|
||||
compression, fetch_full_index, index::TimelineIndexEntryInner, sync_queue, SyncKind,
|
||||
SyncTask,
|
||||
},
|
||||
RemoteStorage, ZTenantTimelineId,
|
||||
},
|
||||
};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
SyncData, TimelineDownload,
|
||||
index::{ArchiveId, RemoteTimeline},
|
||||
RemoteIndex, TimelineDownload,
|
||||
};
|
||||
|
||||
pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
/// Retrieves index data from the remote storage for a given timeline.
|
||||
pub async fn download_index_part<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<IndexPart>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut index_part_bytes = Vec::new();
|
||||
storage
|
||||
.download(&part_storage_path, &mut index_part_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to download an index part from storage path {part_storage_path:?}")
|
||||
})?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
|
||||
format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'")
|
||||
})?;
|
||||
|
||||
let missing_files = index_part.missing_files();
|
||||
if !missing_files.is_empty() {
|
||||
warn!("Found missing layers in index part for timeline {sync_id}: {missing_files:?}");
|
||||
}
|
||||
|
||||
Ok(index_part)
|
||||
}
|
||||
|
||||
/// Timeline download result, with extra data, needed for downloading.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum DownloadedTimeline {
|
||||
/// Remote timeline data is either absent or corrupt, no download possible.
|
||||
Abort,
|
||||
@@ -76,202 +33,217 @@ pub(super) enum DownloadedTimeline {
|
||||
FailedAndRescheduled,
|
||||
/// Remote timeline data is found, its latest checkpoint's metadata contents (disk_consistent_lsn) is known.
|
||||
/// Initial download successful.
|
||||
Successful(SyncData<TimelineDownload>),
|
||||
Successful,
|
||||
}
|
||||
|
||||
/// Attempts to download all given timeline's layers.
|
||||
/// Attempts to download and uncompress files from all remote archives for the timeline given.
|
||||
/// Timeline files that already exist locally are skipped during the download, but the local metadata file is
|
||||
/// updated in the end, if the remote one contains a newer disk_consistent_lsn.
|
||||
/// updated in the end of every checkpoint archive extraction.
|
||||
///
|
||||
/// On an error, bumps the retries count and updates the files to skip with successful downloads, rescheduling the task.
|
||||
pub(super) async fn download_timeline_layers<'a, P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &'a S,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut download_data: SyncData<TimelineDownload>,
|
||||
) -> DownloadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
/// On an error, bumps the retries count and reschedules the download, with updated archive skip list
|
||||
/// (for any new successful archive downloads and extractions).
|
||||
pub(super) async fn download_timeline<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let remote_timeline = match remote_timeline {
|
||||
Some(remote_timeline) => {
|
||||
if !remote_timeline.awaits_download {
|
||||
error!("Timeline with sync id {sync_id} is not awaiting download");
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
remote_timeline
|
||||
}
|
||||
None => {
|
||||
error!("Timeline with sync id {sync_id} is not present in the remote index");
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
};
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RemoteIndex)>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut download: TimelineDownload,
|
||||
retries: u32,
|
||||
) -> DownloadedTimeline {
|
||||
debug!("Downloading layers for sync id {}", sync_id);
|
||||
|
||||
let download = &mut download_data.data;
|
||||
|
||||
let layers_to_download = remote_timeline
|
||||
.stored_files()
|
||||
.difference(&download.layers_to_skip)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
debug!("Layers to download: {layers_to_download:?}");
|
||||
info!("Downloading {} timeline layers", layers_to_download.len());
|
||||
|
||||
let mut download_tasks = layers_to_download
|
||||
.into_iter()
|
||||
.map(|layer_desination_path| async move {
|
||||
if layer_desination_path.exists() {
|
||||
debug!(
|
||||
"Layer already exists locally, skipping download: {}",
|
||||
layer_desination_path.display()
|
||||
);
|
||||
} else {
|
||||
let layer_storage_path = storage
|
||||
.storage_path(&layer_desination_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
layer_desination_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Perform a rename inspired by durable_rename from file_utils.c.
|
||||
// The sequence:
|
||||
// write(tmp)
|
||||
// fsync(tmp)
|
||||
// rename(tmp, new)
|
||||
// fsync(new)
|
||||
// fsync(parent)
|
||||
// For more context about durable_rename check this email from postgres mailing list:
|
||||
// https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
|
||||
// If pageserver crashes the temp file will be deleted on startup and re-downloaded.
|
||||
let temp_file_path =
|
||||
path_with_suffix_extension(&layer_desination_path, TEMP_DOWNLOAD_EXTENSION);
|
||||
|
||||
let mut destination_file =
|
||||
fs::File::create(&temp_file_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create a destination file for layer '{}'",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.download(&layer_storage_path, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download a layer from storage path '{layer_storage_path:?}'"
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
// that have not yet completed. To ensure that a file is closed immediately when it is dropped,
|
||||
// you should call flush before dropping it.
|
||||
//
|
||||
// From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
|
||||
// we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
|
||||
// But for additional safety lets check/wait for any pending operations.
|
||||
destination_file.flush().await.with_context(|| {
|
||||
format!(
|
||||
"failed to flush source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// not using sync_data because it can lose file size update
|
||||
destination_file.sync_all().await.with_context(|| {
|
||||
format!(
|
||||
"failed to fsync source file at {}",
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
drop(destination_file);
|
||||
|
||||
fail::fail_point!("remote-storage-download-pre-rename", |_| {
|
||||
anyhow::bail!("remote-storage-download-pre-rename failpoint triggered")
|
||||
});
|
||||
|
||||
fs::rename(&temp_file_path, &layer_desination_path).await?;
|
||||
|
||||
fsync_path(&layer_desination_path).await.with_context(|| {
|
||||
format!(
|
||||
"Cannot fsync layer destination path {}",
|
||||
layer_desination_path.display(),
|
||||
)
|
||||
})?;
|
||||
}
|
||||
Ok::<_, anyhow::Error>(layer_desination_path)
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errors_happened = false;
|
||||
// keep files we've downloaded to remove them from layers_to_skip if directory fsync fails
|
||||
let mut undo = HashSet::new();
|
||||
while let Some(download_result) = download_tasks.next().await {
|
||||
match download_result {
|
||||
Ok(downloaded_path) => {
|
||||
undo.insert(downloaded_path.clone());
|
||||
download.layers_to_skip.insert(downloaded_path);
|
||||
}
|
||||
Err(e) => {
|
||||
errors_happened = true;
|
||||
error!("Failed to download a layer for timeline {sync_id}: {e:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fsync timeline directory which is a parent directory for downloaded files
|
||||
let ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = &sync_id;
|
||||
let timeline_dir = conf.timeline_path(timeline_id, tenant_id);
|
||||
if let Err(e) = fsync_path(&timeline_dir).await {
|
||||
error!(
|
||||
"Cannot fsync parent directory {} error {}",
|
||||
timeline_dir.display(),
|
||||
e
|
||||
);
|
||||
for item in undo {
|
||||
download.layers_to_skip.remove(&item);
|
||||
} = sync_id;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let index_read = index.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
None => {
|
||||
error!("Cannot download: no timeline is present in the index for given id");
|
||||
return DownloadedTimeline::Abort;
|
||||
}
|
||||
|
||||
Some(index_entry) => match index_entry.inner() {
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => Cow::Borrowed(remote_timeline),
|
||||
TimelineIndexEntryInner::Description(_) => {
|
||||
// we do not check here for awaits_download because it is ok
|
||||
// to call this function while the download is in progress
|
||||
// so it is not a concurrent download, it is the same one
|
||||
|
||||
let remote_disk_consistent_lsn = index_entry.disk_consistent_lsn();
|
||||
drop(index_read);
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match fetch_full_index(
|
||||
remote_assets.as_ref(),
|
||||
&conf.timeline_path(&timeline_id, &tenant_id),
|
||||
sync_id,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(remote_timeline) => Cow::Owned(remote_timeline),
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:?}", e);
|
||||
|
||||
return match remote_disk_consistent_lsn {
|
||||
Some(_) => {
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
DownloadedTimeline::FailedAndRescheduled
|
||||
}
|
||||
None => {
|
||||
error!("Cannot download: no disk consistent Lsn is present for the index entry");
|
||||
DownloadedTimeline::Abort
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
if remote_timeline.checkpoints().max().is_none() {
|
||||
debug!("Cannot download: no disk consistent Lsn is present for the remote timeline");
|
||||
return DownloadedTimeline::Abort;
|
||||
};
|
||||
|
||||
debug!("Downloading timeline archives");
|
||||
let archives_to_download = remote_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.filter(|remote_archive| !download.archives_to_skip.contains(remote_archive))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let archives_total = archives_to_download.len();
|
||||
debug!("Downloading {} archives of a timeline", archives_total);
|
||||
trace!("Archives to download: {:?}", archives_to_download);
|
||||
|
||||
for (archives_downloaded, archive_id) in archives_to_download.into_iter().enumerate() {
|
||||
match try_download_archive(
|
||||
conf,
|
||||
sync_id,
|
||||
Arc::clone(&remote_assets),
|
||||
&remote_timeline,
|
||||
archive_id,
|
||||
Arc::clone(&download.files_to_skip),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(e) => {
|
||||
let archives_left = archives_total - archives_downloaded;
|
||||
error!(
|
||||
"Failed to download archive {:?} (archives downloaded: {}; archives left: {}) for tenant {} timeline {}, requeueing the download: {:?}",
|
||||
archive_id, archives_downloaded, archives_left, tenant_id, timeline_id, e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Download(download),
|
||||
));
|
||||
return DownloadedTimeline::FailedAndRescheduled;
|
||||
}
|
||||
Ok(()) => {
|
||||
debug!("Successfully downloaded archive {:?}", archive_id);
|
||||
download.archives_to_skip.insert(archive_id);
|
||||
}
|
||||
}
|
||||
errors_happened = true;
|
||||
}
|
||||
|
||||
if errors_happened {
|
||||
debug!("Reenqueuing failed download task for timeline {sync_id}");
|
||||
download_data.retries += 1;
|
||||
sync_queue::push(sync_id, SyncTask::Download(download_data));
|
||||
DownloadedTimeline::FailedAndRescheduled
|
||||
} else {
|
||||
info!("Successfully downloaded all layers");
|
||||
DownloadedTimeline::Successful(download_data)
|
||||
}
|
||||
debug!("Finished downloading all timeline's archives");
|
||||
DownloadedTimeline::Successful
|
||||
}
|
||||
|
||||
async fn fsync_path(path: impl AsRef<Path>) -> Result<(), io::Error> {
|
||||
fs::File::open(path).await?.sync_all().await
|
||||
async fn try_download_archive<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
conf: &'static PageServerConf,
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: ZTenantTimelineId,
|
||||
remote_assets: Arc<(S, RemoteIndex)>,
|
||||
remote_timeline: &RemoteTimeline,
|
||||
archive_id: ArchiveId,
|
||||
files_to_skip: Arc<BTreeSet<PathBuf>>,
|
||||
) -> anyhow::Result<()> {
|
||||
debug!("Downloading archive {:?}", archive_id);
|
||||
let archive_to_download = remote_timeline
|
||||
.archive_data(archive_id)
|
||||
.with_context(|| format!("Archive {:?} not found in remote storage", archive_id))?;
|
||||
let (archive_header, header_size) = remote_timeline
|
||||
.restore_header(archive_id)
|
||||
.context("Failed to restore header when downloading an archive")?;
|
||||
|
||||
match read_local_metadata(conf, timeline_id, tenant_id).await {
|
||||
Ok(local_metadata) => ensure!(
|
||||
// need to allow `<=` instead of `<` due to cases when a failed archive can be redownloaded
|
||||
local_metadata.disk_consistent_lsn() <= archive_to_download.disk_consistent_lsn(),
|
||||
"Cannot download archive with Lsn {} since it's earlier than local Lsn {}",
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
local_metadata.disk_consistent_lsn()
|
||||
),
|
||||
Err(e) => warn!("Failed to read local metadata file, assuming it's safe to override its with the download. Read: {:#}", e),
|
||||
}
|
||||
compression::uncompress_file_stream_with_index(
|
||||
conf.timeline_path(&timeline_id, &tenant_id),
|
||||
files_to_skip,
|
||||
archive_to_download.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
move |mut archive_target, archive_name| async move {
|
||||
let archive_local_path = conf
|
||||
.timeline_path(&timeline_id, &tenant_id)
|
||||
.join(&archive_name);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.download_range(
|
||||
&remote_storage.storage_path(&archive_local_path)?,
|
||||
header_size,
|
||||
None,
|
||||
&mut archive_target,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn read_local_metadata(
|
||||
conf: &'static PageServerConf,
|
||||
timeline_id: zenith_utils::zid::ZTimelineId,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<TimelineMetadata> {
|
||||
let local_metadata_path = metadata_path(conf, timeline_id, tenant_id);
|
||||
let local_metadata_bytes = fs::read(&local_metadata_path)
|
||||
.await
|
||||
.context("Failed to read local metadata file bytes")?;
|
||||
TimelineMetadata::from_bytes(&local_metadata_bytes)
|
||||
.context("Failed to read local metadata files bytes")
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
use tokio::fs;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
local_fs::LocalFs,
|
||||
storage_sync::test_utils::{
|
||||
assert_index_descriptions, assert_timeline_files_match, create_local_timeline,
|
||||
dummy_metadata, ensure_correct_timeline_upload, expect_timeline,
|
||||
},
|
||||
LocalFs,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
@@ -279,187 +251,80 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let layer_files = ["a", "b", "layer_to_skip", "layer_to_keep_locally"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
|
||||
for local_path in timeline_upload.layers_to_upload {
|
||||
let remote_path = storage.storage_path(&local_path)?;
|
||||
let remote_parent_dir = remote_path.parent().unwrap();
|
||||
if !remote_parent_dir.exists() {
|
||||
fs::create_dir_all(&remote_parent_dir).await?;
|
||||
}
|
||||
fs::copy(&local_path, &remote_path).await?;
|
||||
}
|
||||
let mut read_dir = fs::read_dir(&local_timeline_path).await?;
|
||||
while let Some(dir_entry) = read_dir.next_entry().await? {
|
||||
if dir_entry.file_name().to_str() == Some("layer_to_keep_locally") {
|
||||
continue;
|
||||
} else {
|
||||
fs::remove_file(dir_entry.path()).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let mut remote_timeline = RemoteTimeline::new(metadata.clone());
|
||||
remote_timeline.awaits_download = true;
|
||||
remote_timeline.add_timeline_layers(
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer| local_timeline_path.join(layer)),
|
||||
);
|
||||
|
||||
let download_data = match download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
Some(&remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
current_retries,
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::from([local_timeline_path.join("layer_to_skip")]),
|
||||
},
|
||||
),
|
||||
)
|
||||
.await
|
||||
{
|
||||
DownloadedTimeline::Successful(data) => data,
|
||||
wrong_result => {
|
||||
panic!("Expected a successful download for timeline, but got: {wrong_result:?}")
|
||||
}
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
current_retries, download_data.retries,
|
||||
"On successful download, retries are not expected to change"
|
||||
);
|
||||
assert_eq!(
|
||||
download_data
|
||||
.data
|
||||
.layers_to_skip
|
||||
async fn test_download_timeline() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("test_download_timeline")?;
|
||||
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer| local_timeline_path.join(layer))
|
||||
.collect(),
|
||||
"On successful download, layers to skip should contain all downloaded files and present layers that were skipped"
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
);
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let mut downloaded_files = BTreeSet::new();
|
||||
let mut read_dir = fs::read_dir(&local_timeline_path).await?;
|
||||
while let Some(dir_entry) = read_dir.next_entry().await? {
|
||||
downloaded_files.insert(dir_entry.path());
|
||||
}
|
||||
let regular_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
let regular_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
dummy_metadata(Lsn(0x30)),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
regular_timeline,
|
||||
)
|
||||
.await;
|
||||
// upload multiple checkpoints for the same timeline
|
||||
let regular_timeline = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["c", "d"],
|
||||
dummy_metadata(Lsn(0x40)),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
regular_timeline,
|
||||
)
|
||||
.await;
|
||||
|
||||
assert_eq!(
|
||||
downloaded_files,
|
||||
layer_files
|
||||
.iter()
|
||||
.filter(|layer| layer != &&"layer_to_skip")
|
||||
.map(|layer| local_timeline_path.join(layer))
|
||||
.collect(),
|
||||
"On successful download, all layers that were not skipped, should be downloaded"
|
||||
);
|
||||
fs::remove_dir_all(®ular_timeline_path).await?;
|
||||
let remote_regular_timeline = expect_timeline(index, sync_id).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn download_timeline_negatives() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("download_timeline_negatives")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
|
||||
let empty_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
None,
|
||||
download_timeline(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
TimelineDownload {
|
||||
files_to_skip: Arc::new(BTreeSet::new()),
|
||||
archives_to_skip: BTreeSet::new(),
|
||||
},
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(
|
||||
index,
|
||||
&RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
matches!(empty_remote_timeline_download, DownloadedTimeline::Abort),
|
||||
"Should not allow downloading for empty remote timeline"
|
||||
);
|
||||
|
||||
let not_expecting_download_remote_timeline = RemoteTimeline::new(dummy_metadata(Lsn(5)));
|
||||
assert!(
|
||||
!not_expecting_download_remote_timeline.awaits_download,
|
||||
"Should not expect download for the timeline"
|
||||
);
|
||||
let already_downloading_remote_timeline_download = download_timeline_layers(
|
||||
harness.conf,
|
||||
&storage,
|
||||
Some(¬_expecting_download_remote_timeline),
|
||||
sync_id,
|
||||
SyncData::new(
|
||||
0,
|
||||
TimelineDownload {
|
||||
layers_to_skip: HashSet::new(),
|
||||
},
|
||||
),
|
||||
)
|
||||
.await;
|
||||
assert!(
|
||||
matches!(
|
||||
already_downloading_remote_timeline_download,
|
||||
DownloadedTimeline::Abort,
|
||||
),
|
||||
"Should not allow downloading for remote timeline that does not expect it"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_download_index_part() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("test_download_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
HashSet::from([
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?,
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?,
|
||||
]),
|
||||
HashSet::from([RelativePath::new(
|
||||
&local_timeline_path,
|
||||
local_timeline_path.join("three"),
|
||||
)?]),
|
||||
metadata.disk_consistent_lsn(),
|
||||
metadata.to_bytes()?,
|
||||
);
|
||||
|
||||
let local_index_part_path =
|
||||
metadata_path(harness.conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let storage_path = storage.storage_path(&local_index_part_path)?;
|
||||
fs::create_dir_all(storage_path.parent().unwrap()).await?;
|
||||
fs::write(&storage_path, serde_json::to_vec(&index_part)?).await?;
|
||||
|
||||
let downloaded_index_part = download_index_part(harness.conf, &storage, sync_id).await?;
|
||||
|
||||
assert_eq!(
|
||||
downloaded_index_part, index_part,
|
||||
"Downloaded index part should be the same as the one in storage"
|
||||
);
|
||||
assert_timeline_files_match(&repo_harness, TIMELINE_ID, remote_regular_timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,53 +1,63 @@
|
||||
//! In-memory index to track the tenant files on the remote storage.
|
||||
//! Able to restore itself from the storage index parts, that are located in every timeline's remote directory and contain all data about
|
||||
//! remote timeline layers and its metadata.
|
||||
//! In-memory index to track the tenant files on the remote strorage, mitigating the storage format differences between the local and remote files.
|
||||
//! Able to restore itself from the storage archive data and reconstruct archive indices on demand.
|
||||
//!
|
||||
//! The index is intended to be portable, so deliberately does not store any local paths inside.
|
||||
//! This way in the future, the index could be restored fast from its serialized stored form.
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
collections::{BTreeMap, BTreeSet, HashMap},
|
||||
path::{Path, PathBuf},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::{Context, Ok};
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use tokio::sync::RwLock;
|
||||
use tracing::*;
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata};
|
||||
use utils::{lsn::Lsn, zid::ZTenantTimelineId};
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::TIMELINES_SEGMENT_NAME,
|
||||
remote_storage::{
|
||||
storage_sync::compression::{parse_archive_name, FileEntry},
|
||||
ZTenantTimelineId,
|
||||
},
|
||||
};
|
||||
|
||||
use super::compression::ArchiveHeader;
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
#[serde(transparent)]
|
||||
pub struct RelativePath(String);
|
||||
|
||||
impl RelativePath {
|
||||
/// Attempts to strip off the base from path, producing a relative path or an error.
|
||||
pub fn new<P: AsRef<Path>>(base: &Path, path: P) -> anyhow::Result<Self> {
|
||||
let path = path.as_ref();
|
||||
let relative = path.strip_prefix(base).with_context(|| {
|
||||
format!(
|
||||
"path '{}' is not relative to base '{}'",
|
||||
path.display(),
|
||||
base.display()
|
||||
)
|
||||
})?;
|
||||
let relative = path
|
||||
.as_ref()
|
||||
.strip_prefix(base)
|
||||
.context("path is not relative to base")?;
|
||||
Ok(RelativePath(relative.to_string_lossy().to_string()))
|
||||
}
|
||||
|
||||
/// Joins the relative path with the base path.
|
||||
fn as_path(&self, base: &Path) -> PathBuf {
|
||||
pub fn as_path(&self, base: &Path) -> PathBuf {
|
||||
base.join(&self.0)
|
||||
}
|
||||
}
|
||||
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
/// Currently, timeline archive files are tracked only.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
timeline_entries: HashMap<ZTenantTimelineId, RemoteTimeline>,
|
||||
timeline_entries: HashMap<ZTenantTimelineId, TimelineIndexEntry>,
|
||||
}
|
||||
|
||||
/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
/// A wrapper to synchrnize access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
|
||||
|
||||
impl RemoteIndex {
|
||||
@@ -57,22 +67,27 @@ impl RemoteIndex {
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn from_parts(
|
||||
/// Attempts to parse file paths (not checking the file contents) and find files
|
||||
/// that can be tracked wiht the index.
|
||||
/// On parse falures, logs the error and continues, so empty index can be created from not suitable paths.
|
||||
pub fn try_parse_descriptions_from_paths<P: AsRef<Path>>(
|
||||
conf: &'static PageServerConf,
|
||||
index_parts: HashMap<ZTenantTimelineId, IndexPart>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut timeline_entries = HashMap::new();
|
||||
|
||||
for (sync_id, index_part) in index_parts {
|
||||
let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.context("Failed to restore remote timeline data from index part")?;
|
||||
timeline_entries.insert(sync_id, remote_timeline);
|
||||
paths: impl Iterator<Item = P>,
|
||||
) -> Self {
|
||||
let mut index = RemoteTimelineIndex {
|
||||
timeline_entries: HashMap::new(),
|
||||
};
|
||||
for path in paths {
|
||||
if let Err(e) = try_parse_index_entry(&mut index, conf, path.as_ref()) {
|
||||
debug!(
|
||||
"Failed to parse path '{}' as index entry: {:#}",
|
||||
path.as_ref().display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex {
|
||||
timeline_entries,
|
||||
}))))
|
||||
Self(Arc::new(RwLock::new(index)))
|
||||
}
|
||||
|
||||
pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
|
||||
@@ -91,18 +106,39 @@ impl Clone for RemoteIndex {
|
||||
}
|
||||
|
||||
impl RemoteTimelineIndex {
|
||||
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> {
|
||||
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&TimelineIndexEntry> {
|
||||
self.timeline_entries.get(id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> {
|
||||
pub fn timeline_entry_mut(
|
||||
&mut self,
|
||||
id: &ZTenantTimelineId,
|
||||
) -> Option<&mut TimelineIndexEntry> {
|
||||
self.timeline_entries.get_mut(id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) {
|
||||
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: TimelineIndexEntry) {
|
||||
self.timeline_entries.insert(id, entry);
|
||||
}
|
||||
|
||||
pub fn upgrade_timeline_entry(
|
||||
&mut self,
|
||||
id: &ZTenantTimelineId,
|
||||
remote_timeline: RemoteTimeline,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut entry = self.timeline_entries.get_mut(id).ok_or(anyhow::anyhow!(
|
||||
"timeline is unexpectedly missing from remote index"
|
||||
))?;
|
||||
|
||||
if !matches!(entry.inner, TimelineIndexEntryInner::Description(_)) {
|
||||
anyhow::bail!("timeline entry is not a description entry")
|
||||
};
|
||||
|
||||
entry.inner = TimelineIndexEntryInner::Full(remote_timeline);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
|
||||
self.timeline_entries.keys().copied()
|
||||
}
|
||||
@@ -114,295 +150,351 @@ impl RemoteTimelineIndex {
|
||||
) -> anyhow::Result<()> {
|
||||
self.timeline_entry_mut(id)
|
||||
.ok_or_else(|| anyhow::anyhow!("unknown timeline sync {}", id))?
|
||||
.awaits_download = awaits_download;
|
||||
.set_awaits_download(awaits_download);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Restored index part data about the timeline, stored in the remote index.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_layers: HashSet<PathBuf>,
|
||||
missing_layers: HashSet<PathBuf>,
|
||||
|
||||
pub metadata: TimelineMetadata,
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Default)]
|
||||
pub struct DescriptionTimelineIndexEntry {
|
||||
pub description: BTreeMap<ArchiveId, ArchiveDescription>,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
impl RemoteTimeline {
|
||||
pub fn new(metadata: TimelineMetadata) -> Self {
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct FullTimelineIndexEntry {
|
||||
pub remote_timeline: RemoteTimeline,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum TimelineIndexEntryInner {
|
||||
Description(BTreeMap<ArchiveId, ArchiveDescription>),
|
||||
Full(RemoteTimeline),
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TimelineIndexEntry {
|
||||
inner: TimelineIndexEntryInner,
|
||||
awaits_download: bool,
|
||||
}
|
||||
|
||||
impl TimelineIndexEntry {
|
||||
pub fn new(inner: TimelineIndexEntryInner, awaits_download: bool) -> Self {
|
||||
Self {
|
||||
timeline_layers: HashSet::new(),
|
||||
missing_layers: HashSet::new(),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
inner,
|
||||
awaits_download,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_timeline_layers(&mut self, new_layers: impl IntoIterator<Item = PathBuf>) {
|
||||
self.timeline_layers.extend(new_layers.into_iter());
|
||||
pub fn inner(&self) -> &TimelineIndexEntryInner {
|
||||
&self.inner
|
||||
}
|
||||
|
||||
pub fn add_upload_failures(&mut self, upload_failures: impl IntoIterator<Item = PathBuf>) {
|
||||
self.missing_layers.extend(upload_failures.into_iter());
|
||||
pub fn inner_mut(&mut self) -> &mut TimelineIndexEntryInner {
|
||||
&mut self.inner
|
||||
}
|
||||
|
||||
pub fn uploaded_checkpoints(&self) -> BTreeSet<Lsn> {
|
||||
match &self.inner {
|
||||
TimelineIndexEntryInner::Description(description) => {
|
||||
description.keys().map(|archive_id| archive_id.0).collect()
|
||||
}
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
|
||||
.checkpoint_archives
|
||||
.keys()
|
||||
.map(|archive_id| archive_id.0)
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets latest uploaded checkpoint's disk consisten Lsn for the corresponding timeline.
|
||||
pub fn disk_consistent_lsn(&self) -> Option<Lsn> {
|
||||
match &self.inner {
|
||||
TimelineIndexEntryInner::Description(description) => {
|
||||
description.keys().map(|archive_id| archive_id.0).max()
|
||||
}
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => remote_timeline
|
||||
.checkpoint_archives
|
||||
.keys()
|
||||
.map(|archive_id| archive_id.0)
|
||||
.max(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_awaits_download(&self) -> bool {
|
||||
self.awaits_download
|
||||
}
|
||||
|
||||
pub fn set_awaits_download(&mut self, awaits_download: bool) {
|
||||
self.awaits_download = awaits_download;
|
||||
}
|
||||
}
|
||||
|
||||
/// Checkpoint archive's id, corresponding to the `disk_consistent_lsn` from the timeline's metadata file during checkpointing.
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
pub struct ArchiveId(pub(super) Lsn);
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)]
|
||||
struct FileId(ArchiveId, ArchiveEntryNumber);
|
||||
|
||||
type ArchiveEntryNumber = usize;
|
||||
|
||||
/// All archives and files in them, representing a certain timeline.
|
||||
/// Uses file and archive IDs to reference those without ownership issues.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct RemoteTimeline {
|
||||
timeline_files: BTreeMap<FileId, FileEntry>,
|
||||
checkpoint_archives: BTreeMap<ArchiveId, CheckpointArchive>,
|
||||
}
|
||||
|
||||
/// Archive metadata, enough to restore a header with the timeline data.
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
pub struct CheckpointArchive {
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_file_size: u64,
|
||||
files: BTreeSet<FileId>,
|
||||
archive_header_size: u64,
|
||||
}
|
||||
|
||||
impl CheckpointArchive {
|
||||
pub fn disk_consistent_lsn(&self) -> Lsn {
|
||||
self.disk_consistent_lsn
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteTimeline {
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
timeline_files: BTreeMap::new(),
|
||||
checkpoint_archives: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn checkpoints(&self) -> impl Iterator<Item = Lsn> + '_ {
|
||||
self.checkpoint_archives
|
||||
.values()
|
||||
.map(CheckpointArchive::disk_consistent_lsn)
|
||||
}
|
||||
|
||||
/// Lists all layer files in the given remote timeline. Omits the metadata file.
|
||||
pub fn stored_files(&self) -> &HashSet<PathBuf> {
|
||||
&self.timeline_layers
|
||||
pub fn stored_files(&self, timeline_dir: &Path) -> BTreeSet<PathBuf> {
|
||||
self.timeline_files
|
||||
.values()
|
||||
.map(|file_entry| file_entry.subpath.as_path(timeline_dir))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn from_index_part(timeline_path: &Path, index_part: IndexPart) -> anyhow::Result<Self> {
|
||||
let metadata = TimelineMetadata::from_bytes(&index_part.metadata_bytes)?;
|
||||
Ok(Self {
|
||||
timeline_layers: to_local_paths(timeline_path, index_part.timeline_layers),
|
||||
missing_layers: to_local_paths(timeline_path, index_part.missing_layers),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
})
|
||||
pub fn contains_checkpoint_at(&self, disk_consistent_lsn: Lsn) -> bool {
|
||||
self.checkpoint_archives
|
||||
.contains_key(&ArchiveId(disk_consistent_lsn))
|
||||
}
|
||||
}
|
||||
|
||||
/// Part of the remote index, corresponding to a certain timeline.
|
||||
/// Contains the data about all files in the timeline, present remotely and its metadata.
|
||||
#[serde_as]
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
|
||||
pub struct IndexPart {
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
/// Currently is not really used in pageserver,
|
||||
/// present to manually keep track of the layer files that pageserver might never retrieve.
|
||||
///
|
||||
/// Such "holes" might appear if any upload task was evicted on an error threshold:
|
||||
/// the this layer will only be rescheduled for upload on pageserver restart.
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
}
|
||||
pub fn archive_data(&self, archive_id: ArchiveId) -> Option<&CheckpointArchive> {
|
||||
self.checkpoint_archives.get(&archive_id)
|
||||
}
|
||||
|
||||
impl IndexPart {
|
||||
pub const FILE_NAME: &'static str = "index_part";
|
||||
pub const FILE_EXTENSION: &'static str = "json";
|
||||
/// Restores a header of a certain remote archive from the memory data.
|
||||
/// Returns the header and its compressed size in the archive, both can be used to uncompress that archive.
|
||||
pub fn restore_header(&self, archive_id: ArchiveId) -> anyhow::Result<(ArchiveHeader, u64)> {
|
||||
let archive = self
|
||||
.checkpoint_archives
|
||||
.get(&archive_id)
|
||||
.with_context(|| format!("Archive {:?} not found", archive_id))?;
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn new(
|
||||
timeline_layers: HashSet<RelativePath>,
|
||||
missing_layers: HashSet<RelativePath>,
|
||||
let mut header_files = Vec::with_capacity(archive.files.len());
|
||||
for (expected_archive_position, archive_file) in archive.files.iter().enumerate() {
|
||||
let &FileId(archive_id, archive_position) = archive_file;
|
||||
ensure!(
|
||||
expected_archive_position == archive_position,
|
||||
"Archive header is corrupt, file # {} from archive {:?} header is missing",
|
||||
expected_archive_position,
|
||||
archive_id,
|
||||
);
|
||||
|
||||
let timeline_file = self.timeline_files.get(archive_file).with_context(|| {
|
||||
format!(
|
||||
"File with id {:?} not found for archive {:?}",
|
||||
archive_file, archive_id
|
||||
)
|
||||
})?;
|
||||
header_files.push(timeline_file.clone());
|
||||
}
|
||||
|
||||
Ok((
|
||||
ArchiveHeader {
|
||||
files: header_files,
|
||||
metadata_file_size: archive.metadata_file_size,
|
||||
},
|
||||
archive.archive_header_size,
|
||||
))
|
||||
}
|
||||
|
||||
/// Updates (creates, if necessary) the data about certain archive contents.
|
||||
pub fn update_archive_contents(
|
||||
&mut self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
metadata_bytes: Vec<u8>,
|
||||
) -> Self {
|
||||
Self {
|
||||
timeline_layers,
|
||||
missing_layers,
|
||||
disk_consistent_lsn,
|
||||
metadata_bytes,
|
||||
header: ArchiveHeader,
|
||||
header_size: u64,
|
||||
) {
|
||||
let archive_id = ArchiveId(disk_consistent_lsn);
|
||||
let mut common_archive_files = BTreeSet::new();
|
||||
for (file_index, file_entry) in header.files.into_iter().enumerate() {
|
||||
let file_id = FileId(archive_id, file_index);
|
||||
self.timeline_files.insert(file_id, file_entry);
|
||||
common_archive_files.insert(file_id);
|
||||
}
|
||||
|
||||
let metadata_file_size = header.metadata_file_size;
|
||||
self.checkpoint_archives
|
||||
.entry(archive_id)
|
||||
.or_insert_with(|| CheckpointArchive {
|
||||
metadata_file_size,
|
||||
files: BTreeSet::new(),
|
||||
archive_header_size: header_size,
|
||||
disk_consistent_lsn,
|
||||
})
|
||||
.files
|
||||
.extend(common_archive_files.into_iter());
|
||||
}
|
||||
}
|
||||
|
||||
/// Metadata abput timeline checkpoint archive, parsed from its remote storage path.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ArchiveDescription {
|
||||
pub header_size: u64,
|
||||
pub disk_consistent_lsn: Lsn,
|
||||
pub archive_name: String,
|
||||
}
|
||||
|
||||
fn try_parse_index_entry(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
conf: &'static PageServerConf,
|
||||
path: &Path,
|
||||
) -> anyhow::Result<()> {
|
||||
let tenants_dir = conf.tenants_path();
|
||||
let tenant_id = path
|
||||
.strip_prefix(&tenants_dir)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Path '{}' does not belong to tenants directory '{}'",
|
||||
path.display(),
|
||||
tenants_dir.display(),
|
||||
)
|
||||
})?
|
||||
.iter()
|
||||
.next()
|
||||
.with_context(|| format!("Found no tenant id in path '{}'", path.display()))?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTenantId>()
|
||||
.with_context(|| format!("Failed to parse tenant id from path '{}'", path.display()))?;
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant_id);
|
||||
match path.strip_prefix(&timelines_path) {
|
||||
Ok(timelines_subpath) => {
|
||||
let mut segments = timelines_subpath.iter();
|
||||
let timeline_id = segments
|
||||
.next()
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"{} directory of tenant {} (path '{}') is not an index entry",
|
||||
TIMELINES_SEGMENT_NAME,
|
||||
tenant_id,
|
||||
path.display()
|
||||
)
|
||||
})?
|
||||
.to_string_lossy()
|
||||
.parse::<ZTimelineId>()
|
||||
.with_context(|| {
|
||||
format!("Failed to parse timeline id from path '{}'", path.display())
|
||||
})?;
|
||||
|
||||
let (disk_consistent_lsn, header_size) =
|
||||
parse_archive_name(path).with_context(|| {
|
||||
format!(
|
||||
"Failed to parse archive name out in path '{}'",
|
||||
path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let archive_name = path
|
||||
.file_name()
|
||||
.with_context(|| format!("Archive '{}' has no file name", path.display()))?
|
||||
.to_string_lossy()
|
||||
.to_string();
|
||||
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let timeline_index_entry = index.timeline_entries.entry(sync_id).or_insert_with(|| {
|
||||
TimelineIndexEntry::new(
|
||||
TimelineIndexEntryInner::Description(BTreeMap::default()),
|
||||
false,
|
||||
)
|
||||
});
|
||||
match timeline_index_entry.inner_mut() {
|
||||
TimelineIndexEntryInner::Description(description) => {
|
||||
description.insert(
|
||||
ArchiveId(disk_consistent_lsn),
|
||||
ArchiveDescription {
|
||||
header_size,
|
||||
disk_consistent_lsn,
|
||||
archive_name,
|
||||
},
|
||||
);
|
||||
}
|
||||
TimelineIndexEntryInner::Full(_) => {
|
||||
bail!("Cannot add parsed archive description to its full context in index with sync id {}", sync_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(timelines_strip_error) => {
|
||||
bail!(
|
||||
"Path '{}' is not an archive entry '{}'",
|
||||
path.display(),
|
||||
timelines_strip_error,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn missing_files(&self) -> &HashSet<RelativePath> {
|
||||
&self.missing_layers
|
||||
}
|
||||
|
||||
pub fn from_remote_timeline(
|
||||
timeline_path: &Path,
|
||||
remote_timeline: RemoteTimeline,
|
||||
) -> anyhow::Result<Self> {
|
||||
let metadata_bytes = remote_timeline.metadata.to_bytes()?;
|
||||
Ok(Self {
|
||||
timeline_layers: to_relative_paths(timeline_path, remote_timeline.timeline_layers)
|
||||
.context("Failed to convert timeline layers' paths to relative ones")?,
|
||||
missing_layers: to_relative_paths(timeline_path, remote_timeline.missing_layers)
|
||||
.context("Failed to convert missing layers' paths to relative ones")?,
|
||||
disk_consistent_lsn: remote_timeline.metadata.disk_consistent_lsn(),
|
||||
metadata_bytes,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn to_local_paths(
|
||||
timeline_path: &Path,
|
||||
paths: impl IntoIterator<Item = RelativePath>,
|
||||
) -> HashSet<PathBuf> {
|
||||
paths
|
||||
.into_iter()
|
||||
.map(|path| path.as_path(timeline_path))
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn to_relative_paths(
|
||||
timeline_path: &Path,
|
||||
paths: impl IntoIterator<Item = PathBuf>,
|
||||
) -> anyhow::Result<HashSet<RelativePath>> {
|
||||
paths
|
||||
.into_iter()
|
||||
.map(|path| RelativePath::new(timeline_path, path))
|
||||
.collect()
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::repo_harness::{RepoHarness, TIMELINE_ID};
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion() {
|
||||
let harness = RepoHarness::create("index_part_conversion").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata =
|
||||
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
|
||||
let remote_timeline = RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
timeline_path.join("layer_1"),
|
||||
timeline_path.join("layer_2"),
|
||||
]),
|
||||
missing_layers: HashSet::from([
|
||||
timeline_path.join("missing_1"),
|
||||
timeline_path.join("missing_2"),
|
||||
]),
|
||||
metadata: metadata.clone(),
|
||||
awaits_download: false,
|
||||
fn header_restoration_preserves_file_order() {
|
||||
let header = ArchiveHeader {
|
||||
files: vec![
|
||||
FileEntry {
|
||||
size: 5,
|
||||
subpath: RelativePath("one".to_string()),
|
||||
},
|
||||
FileEntry {
|
||||
size: 1,
|
||||
subpath: RelativePath("two".to_string()),
|
||||
},
|
||||
FileEntry {
|
||||
size: 222,
|
||||
subpath: RelativePath("zero".to_string()),
|
||||
},
|
||||
],
|
||||
metadata_file_size: 5,
|
||||
};
|
||||
|
||||
let index_part = IndexPart::from_remote_timeline(&timeline_path, remote_timeline.clone())
|
||||
.expect("Correct remote timeline should be convertable to index part");
|
||||
let lsn = Lsn(1);
|
||||
let mut remote_timeline = RemoteTimeline::empty();
|
||||
remote_timeline.update_archive_contents(lsn, header.clone(), 15);
|
||||
|
||||
let (restored_header, _) = remote_timeline
|
||||
.restore_header(ArchiveId(lsn))
|
||||
.expect("Should be able to restore header from a valid remote timeline");
|
||||
|
||||
assert_eq!(
|
||||
index_part.timeline_layers.iter().collect::<BTreeSet<_>>(),
|
||||
BTreeSet::from([
|
||||
&RelativePath("layer_1".to_string()),
|
||||
&RelativePath("layer_2".to_string())
|
||||
]),
|
||||
"Index part should have all remote timeline layers after the conversion"
|
||||
header, restored_header,
|
||||
"Header restoration should preserve file order"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part.missing_layers.iter().collect::<BTreeSet<_>>(),
|
||||
BTreeSet::from([
|
||||
&RelativePath("missing_1".to_string()),
|
||||
&RelativePath("missing_2".to_string())
|
||||
]),
|
||||
"Index part should have all missing remote timeline layers after the conversion"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part.disk_consistent_lsn,
|
||||
metadata.disk_consistent_lsn(),
|
||||
"Index part should have disk consistent lsn from the timeline"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part.metadata_bytes,
|
||||
metadata
|
||||
.to_bytes()
|
||||
.expect("Failed to serialize correct metadata into bytes"),
|
||||
"Index part should have all missing remote timeline layers after the conversion"
|
||||
);
|
||||
|
||||
let restored_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.expect("Correct index part should be convertable to remote timeline");
|
||||
|
||||
let original_metadata = &remote_timeline.metadata;
|
||||
let restored_metadata = &restored_timeline.metadata;
|
||||
// we have to compare the metadata this way, since its header is different after creation and restoration,
|
||||
// but that is now consireded ok.
|
||||
assert_eq!(
|
||||
original_metadata.disk_consistent_lsn(),
|
||||
restored_metadata.disk_consistent_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.prev_record_lsn(),
|
||||
restored_metadata.prev_record_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.ancestor_timeline(),
|
||||
restored_metadata.ancestor_timeline(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.ancestor_lsn(),
|
||||
restored_metadata.ancestor_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.latest_gc_cutoff_lsn(),
|
||||
restored_metadata.latest_gc_cutoff_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
assert_eq!(
|
||||
original_metadata.initdb_lsn(),
|
||||
restored_metadata.initdb_lsn(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not alter metadata"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
remote_timeline.awaits_download, restored_timeline.awaits_download,
|
||||
"remote timeline -> index part -> remote timeline conversion should not loose download flag"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
remote_timeline
|
||||
.timeline_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
restored_timeline
|
||||
.timeline_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not loose layer data"
|
||||
);
|
||||
assert_eq!(
|
||||
remote_timeline
|
||||
.missing_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
restored_timeline
|
||||
.missing_layers
|
||||
.into_iter()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
"remote timeline -> index part -> remote timeline conversion should not loose missing file data"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_part_conversion_negatives() {
|
||||
let harness = RepoHarness::create("index_part_conversion_negatives").unwrap();
|
||||
let timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let metadata =
|
||||
TimelineMetadata::new(Lsn(5).align(), Some(Lsn(4)), None, Lsn(3), Lsn(2), Lsn(1));
|
||||
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
PathBuf::from("bad_path"),
|
||||
timeline_path.join("layer_2"),
|
||||
]),
|
||||
missing_layers: HashSet::from([
|
||||
timeline_path.join("missing_1"),
|
||||
timeline_path.join("missing_2"),
|
||||
]),
|
||||
metadata: metadata.clone(),
|
||||
awaits_download: false,
|
||||
},
|
||||
);
|
||||
assert!(conversion_result.is_err(), "Should not be able to convert metadata with layer paths that are not in the timeline directory");
|
||||
|
||||
let conversion_result = IndexPart::from_remote_timeline(
|
||||
&timeline_path,
|
||||
RemoteTimeline {
|
||||
timeline_layers: HashSet::from([
|
||||
timeline_path.join("layer_1"),
|
||||
timeline_path.join("layer_2"),
|
||||
]),
|
||||
missing_layers: HashSet::from([
|
||||
PathBuf::from("bad_path"),
|
||||
timeline_path.join("missing_2"),
|
||||
]),
|
||||
metadata,
|
||||
awaits_download: false,
|
||||
},
|
||||
);
|
||||
assert!(conversion_result.is_err(), "Should not be able to convert metadata with missing layer paths that are not in the timeline directory");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,452 +1,513 @@
|
||||
//! Timeline synchronization logic to compress and upload to the remote storage all new timeline files from the checkpoints.
|
||||
|
||||
use std::{fmt::Debug, path::PathBuf};
|
||||
use std::{borrow::Cow, collections::BTreeSet, path::PathBuf, sync::Arc};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use tokio::fs;
|
||||
use tracing::{debug, error, info, warn};
|
||||
use tracing::{debug, error, warn};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::metadata_path,
|
||||
remote_storage::{
|
||||
storage_sync::{index::RemoteTimeline, sync_queue, SyncTask},
|
||||
RemoteStorage,
|
||||
storage_sync::{
|
||||
compression, fetch_full_index,
|
||||
index::{RemoteTimeline, TimelineIndexEntry, TimelineIndexEntryInner},
|
||||
sync_queue, SyncKind, SyncTask,
|
||||
},
|
||||
RemoteStorage, ZTenantTimelineId,
|
||||
},
|
||||
};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{index::IndexPart, SyncData, TimelineUpload};
|
||||
use super::{compression::ArchiveHeader, NewCheckpoint, RemoteIndex};
|
||||
|
||||
/// Serializes and uploads the given index part data to the remote storage.
|
||||
pub(super) async fn upload_index_part<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
sync_id: ZTenantTimelineId,
|
||||
index_part: IndexPart,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let index_part_bytes = serde_json::to_vec(&index_part)
|
||||
.context("Failed to serialize index part file into bytes")?;
|
||||
let index_part_size = index_part_bytes.len();
|
||||
let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
|
||||
|
||||
let index_part_path = metadata_path(conf, sync_id.timeline_id, sync_id.tenant_id)
|
||||
.with_file_name(IndexPart::FILE_NAME)
|
||||
.with_extension(IndexPart::FILE_EXTENSION);
|
||||
let index_part_storage_path = storage.storage_path(&index_part_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get the index part storage path for local path '{}'",
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
storage
|
||||
.upload(
|
||||
index_part_bytes,
|
||||
index_part_size,
|
||||
&index_part_storage_path,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to upload index part to the storage path '{index_part_storage_path:?}'")
|
||||
})
|
||||
}
|
||||
|
||||
/// Timeline upload result, with extra data, needed for uploading.
|
||||
#[derive(Debug)]
|
||||
pub(super) enum UploadedTimeline {
|
||||
/// Upload failed due to some error, the upload task is rescheduled for another retry.
|
||||
FailedAndRescheduled,
|
||||
/// No issues happened during the upload, all task files were put into the remote storage.
|
||||
Successful(SyncData<TimelineUpload>),
|
||||
/// No failures happened during the upload, but some files were removed locally before the upload task completed
|
||||
/// (could happen due to retries, for instance, if GC happens in the interim).
|
||||
/// Such files are considered "not needed" and ignored, but the task's metadata should be discarded and the new one loaded from the local file.
|
||||
SuccessfulAfterLocalFsUpdate(SyncData<TimelineUpload>),
|
||||
}
|
||||
|
||||
/// Attempts to upload given layer files.
|
||||
/// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload.
|
||||
/// Attempts to compress and upload given checkpoint files.
|
||||
/// No extra checks for overlapping files is made: download takes care of that, ensuring no non-metadata local timeline files are overwritten.
|
||||
///
|
||||
/// On an error, bumps the retries count and reschedules the entire task.
|
||||
pub(super) async fn upload_timeline_layers<'a, P, S>(
|
||||
storage: &'a S,
|
||||
remote_timeline: Option<&'a RemoteTimeline>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
mut upload_data: SyncData<TimelineUpload>,
|
||||
) -> UploadedTimeline
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
/// On success, populates index data with new downloads.
|
||||
pub(super) async fn upload_timeline_checkpoint<
|
||||
P: std::fmt::Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
{
|
||||
let upload = &mut upload_data.data;
|
||||
let new_upload_lsn = upload.metadata.disk_consistent_lsn();
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RemoteIndex)>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
new_checkpoint: NewCheckpoint,
|
||||
retries: u32,
|
||||
) -> Option<bool> {
|
||||
debug!("Uploading checkpoint for sync id {}", sync_id);
|
||||
let new_upload_lsn = new_checkpoint.metadata.disk_consistent_lsn();
|
||||
|
||||
let already_uploaded_layers = remote_timeline
|
||||
.map(|timeline| timeline.stored_files())
|
||||
.cloned()
|
||||
.unwrap_or_default();
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let layers_to_upload = upload
|
||||
.layers_to_upload
|
||||
.difference(&already_uploaded_layers)
|
||||
.cloned()
|
||||
.collect::<Vec<_>>();
|
||||
let ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = sync_id;
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
debug!("Layers to upload: {layers_to_upload:?}");
|
||||
info!(
|
||||
"Uploading {} timeline layers, new lsn: {new_upload_lsn}",
|
||||
layers_to_upload.len(),
|
||||
);
|
||||
|
||||
let mut upload_tasks = layers_to_upload
|
||||
.into_iter()
|
||||
.map(|source_path| async move {
|
||||
let storage_path = storage
|
||||
.storage_path(&source_path)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the layer storage path for local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(UploadError::Other)?;
|
||||
|
||||
let source_file = match fs::File::open(&source_path).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to upen a source file for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
}) {
|
||||
Ok(file) => file,
|
||||
Err(e) => return Err(UploadError::MissingLocalFile(source_path, e)),
|
||||
};
|
||||
|
||||
let source_size = source_file
|
||||
.metadata()
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the source file metadata for layer '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(UploadError::Other)?
|
||||
.len() as usize;
|
||||
|
||||
match storage
|
||||
.upload(source_file, source_size, &storage_path, None)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to upload a layer from local path '{}'",
|
||||
source_path.display()
|
||||
)
|
||||
}) {
|
||||
Ok(()) => Ok(source_path),
|
||||
Err(e) => Err(UploadError::MissingLocalFile(source_path, e)),
|
||||
}
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut errors_happened = false;
|
||||
let mut local_fs_updated = false;
|
||||
while let Some(upload_result) = upload_tasks.next().await {
|
||||
match upload_result {
|
||||
Ok(uploaded_path) => {
|
||||
upload.layers_to_upload.remove(&uploaded_path);
|
||||
upload.uploaded_layers.insert(uploaded_path);
|
||||
}
|
||||
Err(e) => match e {
|
||||
UploadError::Other(e) => {
|
||||
errors_happened = true;
|
||||
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
|
||||
}
|
||||
UploadError::MissingLocalFile(source_path, e) => {
|
||||
if source_path.exists() {
|
||||
errors_happened = true;
|
||||
error!("Failed to upload a layer for timeline {sync_id}: {e:?}");
|
||||
} else {
|
||||
local_fs_updated = true;
|
||||
upload.layers_to_upload.remove(&source_path);
|
||||
warn!(
|
||||
"Missing locally a layer file {} scheduled for upload, skipping",
|
||||
source_path.display()
|
||||
);
|
||||
let index_read = index.read().await;
|
||||
let remote_timeline = match index_read.timeline_entry(&sync_id) {
|
||||
None => None,
|
||||
Some(entry) => match entry.inner() {
|
||||
TimelineIndexEntryInner::Full(remote_timeline) => Some(Cow::Borrowed(remote_timeline)),
|
||||
TimelineIndexEntryInner::Description(_) => {
|
||||
debug!("Found timeline description for the given ids, downloading the full index");
|
||||
match fetch_full_index(remote_assets.as_ref(), &timeline_dir, sync_id).await {
|
||||
Ok(remote_timeline) => Some(Cow::Owned(remote_timeline)),
|
||||
Err(e) => {
|
||||
error!("Failed to download full timeline index: {:?}", e);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
return Some(false);
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
let already_contains_upload_lsn = remote_timeline
|
||||
.as_ref()
|
||||
.map(|remote_timeline| remote_timeline.contains_checkpoint_at(new_upload_lsn))
|
||||
.unwrap_or(false);
|
||||
if already_contains_upload_lsn {
|
||||
warn!(
|
||||
"Received a checkpoint with Lsn {} that's already been uploaded to remote storage, skipping the upload.",
|
||||
new_upload_lsn
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
if errors_happened {
|
||||
debug!("Reenqueuing failed upload task for timeline {sync_id}");
|
||||
upload_data.retries += 1;
|
||||
sync_queue::push(sync_id, SyncTask::Upload(upload_data));
|
||||
UploadedTimeline::FailedAndRescheduled
|
||||
} else if local_fs_updated {
|
||||
info!("Successfully uploaded all layers, some local layers were removed during the upload");
|
||||
UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data)
|
||||
} else {
|
||||
info!("Successfully uploaded all layers");
|
||||
UploadedTimeline::Successful(upload_data)
|
||||
let already_uploaded_files = remote_timeline
|
||||
.map(|timeline| timeline.stored_files(&timeline_dir))
|
||||
.unwrap_or_default();
|
||||
drop(index_read);
|
||||
|
||||
match try_upload_checkpoint(
|
||||
config,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
&new_checkpoint,
|
||||
already_uploaded_files,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(Ok((archive_header, header_size))) => {
|
||||
let mut index_write = index.write().await;
|
||||
match index_write
|
||||
.timeline_entry_mut(&sync_id)
|
||||
.map(|e| e.inner_mut())
|
||||
{
|
||||
None => {
|
||||
let mut new_timeline = RemoteTimeline::empty();
|
||||
new_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
index_write.add_timeline_entry(
|
||||
sync_id,
|
||||
TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
|
||||
)
|
||||
}
|
||||
Some(TimelineIndexEntryInner::Full(remote_timeline)) => {
|
||||
remote_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
}
|
||||
Some(TimelineIndexEntryInner::Description(_)) => {
|
||||
let mut new_timeline = RemoteTimeline::empty();
|
||||
new_timeline.update_archive_contents(
|
||||
new_checkpoint.metadata.disk_consistent_lsn(),
|
||||
archive_header,
|
||||
header_size,
|
||||
);
|
||||
index_write.add_timeline_entry(
|
||||
sync_id,
|
||||
TimelineIndexEntry::new(TimelineIndexEntryInner::Full(new_timeline), false),
|
||||
)
|
||||
}
|
||||
}
|
||||
debug!("Checkpoint uploaded successfully");
|
||||
Some(true)
|
||||
}
|
||||
Some(Err(e)) => {
|
||||
error!(
|
||||
"Failed to upload checkpoint: {:?}, requeueing the upload",
|
||||
e
|
||||
);
|
||||
sync_queue::push(SyncTask::new(
|
||||
sync_id,
|
||||
retries,
|
||||
SyncKind::Upload(new_checkpoint),
|
||||
));
|
||||
Some(false)
|
||||
}
|
||||
None => Some(true),
|
||||
}
|
||||
}
|
||||
|
||||
enum UploadError {
|
||||
MissingLocalFile(PathBuf, anyhow::Error),
|
||||
Other(anyhow::Error),
|
||||
async fn try_upload_checkpoint<
|
||||
P: Send + Sync + 'static,
|
||||
S: RemoteStorage<StoragePath = P> + Send + Sync + 'static,
|
||||
>(
|
||||
config: &'static PageServerConf,
|
||||
remote_assets: Arc<(S, RemoteIndex)>,
|
||||
sync_id: ZTenantTimelineId,
|
||||
new_checkpoint: &NewCheckpoint,
|
||||
files_to_skip: BTreeSet<PathBuf>,
|
||||
) -> Option<anyhow::Result<(ArchiveHeader, u64)>> {
|
||||
let ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = sync_id;
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
|
||||
let files_to_upload = new_checkpoint
|
||||
.layers
|
||||
.iter()
|
||||
.filter(|&path_to_upload| {
|
||||
if files_to_skip.contains(path_to_upload) {
|
||||
warn!(
|
||||
"Skipping file upload '{}', since it was already uploaded",
|
||||
path_to_upload.display()
|
||||
);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if files_to_upload.is_empty() {
|
||||
warn!(
|
||||
"No files to upload. Upload request was: {:?}, already uploaded files: {:?}",
|
||||
new_checkpoint.layers, files_to_skip
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
let upload_result = compression::archive_files_as_stream(
|
||||
&timeline_dir,
|
||||
files_to_upload.into_iter(),
|
||||
&new_checkpoint.metadata,
|
||||
move |archive_streamer, archive_name| async move {
|
||||
let timeline_dir = config.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_storage = &remote_assets.0;
|
||||
remote_storage
|
||||
.upload(
|
||||
archive_streamer,
|
||||
&remote_storage.storage_path(&timeline_dir.join(&archive_name))?,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
},
|
||||
)
|
||||
.await
|
||||
.map(|(header, header_size, _)| (header, header_size));
|
||||
|
||||
Some(upload_result)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
|
||||
use tempfile::tempdir;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
use crate::{
|
||||
remote_storage::{
|
||||
local_fs::LocalFs,
|
||||
storage_sync::{
|
||||
index::RelativePath,
|
||||
test_utils::{create_local_timeline, dummy_metadata},
|
||||
index::ArchiveId,
|
||||
test_utils::{
|
||||
assert_index_descriptions, create_local_timeline, dummy_metadata,
|
||||
ensure_correct_timeline_upload, expect_timeline,
|
||||
},
|
||||
},
|
||||
LocalFs,
|
||||
},
|
||||
repository::repo_harness::{RepoHarness, TIMELINE_ID},
|
||||
};
|
||||
|
||||
use super::{upload_index_part, *};
|
||||
use super::*;
|
||||
|
||||
#[tokio::test]
|
||||
async fn regular_layer_upload() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("regular_layer_upload")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a", "b"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_path_buf(), &harness.conf.workdir)?;
|
||||
let current_retries = 3;
|
||||
let metadata = dummy_metadata(Lsn(0x30));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layer_files, metadata.clone()).await?;
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
async fn reupload_timeline() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("reupload_timeline")?;
|
||||
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
);
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let upload_result = upload_timeline_layers(
|
||||
&storage,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(current_retries, timeline_upload.clone()),
|
||||
let first_upload_metadata = dummy_metadata(Lsn(0x10));
|
||||
let first_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
let local_timeline_path = repo_harness.timeline_path(&TIMELINE_ID);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
first_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let upload_data = match upload_result {
|
||||
UploadedTimeline::Successful(upload_data) => upload_data,
|
||||
wrong_result => {
|
||||
panic!("Expected a successful upload for timeline, but got: {wrong_result:?}")
|
||||
}
|
||||
};
|
||||
|
||||
let uploaded_timeline = expect_timeline(index, sync_id).await;
|
||||
let uploaded_archives = uploaded_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
current_retries, upload_data.retries,
|
||||
"On successful upload, retries are not expected to change"
|
||||
);
|
||||
let upload = &upload_data.data;
|
||||
assert!(
|
||||
upload.layers_to_upload.is_empty(),
|
||||
"Successful upload should have no layers left to upload"
|
||||
);
|
||||
assert_eq!(
|
||||
upload
|
||||
.uploaded_layers
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_file| local_timeline_path.join(layer_file))
|
||||
.collect(),
|
||||
"Successful upload should have all layers uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
upload.metadata, metadata,
|
||||
"Successful upload should not chage its metadata"
|
||||
);
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
layer_files.len(),
|
||||
"All layers should be uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
storage_files
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path))
|
||||
.collect::<anyhow::Result<BTreeSet<_>>>()?,
|
||||
layer_files
|
||||
.into_iter()
|
||||
.map(|file| local_timeline_path.join(file))
|
||||
.collect(),
|
||||
"Uploaded files should match with the local ones"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Currently, GC can run between upload retries, removing local layers scheduled for upload. Test this scenario.
|
||||
#[tokio::test]
|
||||
async fn layer_upload_after_local_fs_update() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("layer_upload_after_local_fs_update")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let layer_files = ["a1", "b1"];
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
let current_retries = 5;
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
let layers_to_upload = {
|
||||
let mut layers = layer_files.to_vec();
|
||||
layers.push("layer_to_remove");
|
||||
layers
|
||||
};
|
||||
let timeline_upload =
|
||||
create_local_timeline(&harness, TIMELINE_ID, &layers_to_upload, metadata.clone())
|
||||
.await?;
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
|
||||
fs::remove_file(local_timeline_path.join("layer_to_remove")).await?;
|
||||
|
||||
let upload_result = upload_timeline_layers(
|
||||
&storage,
|
||||
None,
|
||||
sync_id,
|
||||
SyncData::new(current_retries, timeline_upload.clone()),
|
||||
)
|
||||
.await;
|
||||
|
||||
let upload_data = match upload_result {
|
||||
UploadedTimeline::SuccessfulAfterLocalFsUpdate(upload_data) => upload_data,
|
||||
wrong_result => panic!(
|
||||
"Expected a successful after local fs upload for timeline, but got: {wrong_result:?}"
|
||||
),
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
current_retries, upload_data.retries,
|
||||
"On successful upload, retries are not expected to change"
|
||||
);
|
||||
let upload = &upload_data.data;
|
||||
assert!(
|
||||
upload.layers_to_upload.is_empty(),
|
||||
"Successful upload should have no layers left to upload, even those that were removed from the local fs"
|
||||
);
|
||||
assert_eq!(
|
||||
upload
|
||||
.uploaded_layers
|
||||
.iter()
|
||||
.cloned()
|
||||
.collect::<BTreeSet<_>>(),
|
||||
layer_files
|
||||
.iter()
|
||||
.map(|layer_file| local_timeline_path.join(layer_file))
|
||||
.collect(),
|
||||
"Successful upload should have all layers uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
upload.metadata, metadata,
|
||||
"Successful upload should not chage its metadata"
|
||||
);
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
layer_files.len(),
|
||||
"All layers should be uploaded"
|
||||
);
|
||||
assert_eq!(
|
||||
storage_files
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path))
|
||||
.collect::<anyhow::Result<BTreeSet<_>>>()?,
|
||||
layer_files
|
||||
.into_iter()
|
||||
.map(|file| local_timeline_path.join(file))
|
||||
.collect(),
|
||||
"Uploaded files should match with the local ones"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_upload_index_part() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("test_upload_index_part")?;
|
||||
let sync_id = ZTenantTimelineId::new(harness.tenant_id, TIMELINE_ID);
|
||||
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &harness.conf.workdir)?;
|
||||
let metadata = dummy_metadata(Lsn(0x40));
|
||||
let local_timeline_path = harness.timeline_path(&TIMELINE_ID);
|
||||
|
||||
let index_part = IndexPart::new(
|
||||
HashSet::from([
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("one"))?,
|
||||
RelativePath::new(&local_timeline_path, local_timeline_path.join("two"))?,
|
||||
]),
|
||||
HashSet::from([RelativePath::new(
|
||||
&local_timeline_path,
|
||||
local_timeline_path.join("three"),
|
||||
)?]),
|
||||
metadata.disk_consistent_lsn(),
|
||||
metadata.to_bytes()?,
|
||||
);
|
||||
|
||||
assert!(
|
||||
storage.list().await?.is_empty(),
|
||||
"Storage should be empty before any uploads are made"
|
||||
);
|
||||
upload_index_part(harness.conf, &storage, sync_id, index_part.clone()).await?;
|
||||
|
||||
let storage_files = storage.list().await?;
|
||||
assert_eq!(
|
||||
storage_files.len(),
|
||||
uploaded_archives.len(),
|
||||
1,
|
||||
"Should have only the index part file uploaded"
|
||||
"Only one archive is expected after a first upload"
|
||||
);
|
||||
let first_uploaded_archive = uploaded_archives.first().copied().unwrap();
|
||||
assert_eq!(
|
||||
uploaded_timeline.checkpoints().last(),
|
||||
Some(first_upload_metadata.disk_consistent_lsn()),
|
||||
"Metadata that was uploaded, should have its Lsn stored"
|
||||
);
|
||||
assert_eq!(
|
||||
uploaded_timeline
|
||||
.archive_data(uploaded_archives.first().copied().unwrap())
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
uploaded_timeline.stored_files(&local_timeline_path),
|
||||
vec![local_timeline_path.join("a"), local_timeline_path.join("b")]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from the first checkpoint"
|
||||
);
|
||||
|
||||
let index_part_path = storage_files.first().unwrap();
|
||||
let second_upload_metadata = dummy_metadata(Lsn(0x40));
|
||||
let second_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["b", "c"],
|
||||
second_upload_metadata.clone(),
|
||||
)?;
|
||||
assert!(
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
second_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let updated_timeline = expect_timeline(index, sync_id).await;
|
||||
let mut updated_archives = updated_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
index_part_path.file_stem().and_then(|name| name.to_str()),
|
||||
Some(IndexPart::FILE_NAME),
|
||||
"Remote index part should have the correct name"
|
||||
updated_archives.len(),
|
||||
2,
|
||||
"Two archives are expected after a successful update of the upload"
|
||||
);
|
||||
updated_archives.retain(|archive_id| archive_id != &first_uploaded_archive);
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
1,
|
||||
"Only one new archive is expected among the uploaded"
|
||||
);
|
||||
let second_uploaded_archive = updated_archives.last().copied().unwrap();
|
||||
assert_eq!(
|
||||
updated_timeline.checkpoints().max(),
|
||||
Some(second_upload_metadata.disk_consistent_lsn()),
|
||||
"Metadata that was uploaded, should have its Lsn stored"
|
||||
);
|
||||
assert_eq!(
|
||||
index_part_path
|
||||
.extension()
|
||||
.and_then(|extension| extension.to_str()),
|
||||
Some(IndexPart::FILE_EXTENSION),
|
||||
"Remote index part should have the correct extension"
|
||||
updated_timeline
|
||||
.archive_data(second_uploaded_archive)
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
second_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline.stored_files(&local_timeline_path),
|
||||
vec![
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("b"),
|
||||
local_timeline_path.join("c"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from both checkpoints without duplicates"
|
||||
);
|
||||
|
||||
let remote_index_part: IndexPart =
|
||||
serde_json::from_slice(&fs::read(&index_part_path).await?)?;
|
||||
assert_eq!(
|
||||
index_part, remote_index_part,
|
||||
"Remote index part should match the local one"
|
||||
let third_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
let third_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["d"],
|
||||
third_upload_metadata.clone(),
|
||||
)?;
|
||||
assert_ne!(
|
||||
third_upload_metadata.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
assert!(
|
||||
third_upload_metadata.disk_consistent_lsn()
|
||||
< second_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
third_checkpoint,
|
||||
)
|
||||
.await;
|
||||
|
||||
let updated_timeline = expect_timeline(index, sync_id).await;
|
||||
let mut updated_archives = updated_timeline
|
||||
.checkpoints()
|
||||
.map(ArchiveId)
|
||||
.collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
3,
|
||||
"Three archives are expected after two successful updates of the upload"
|
||||
);
|
||||
updated_archives.retain(|archive_id| {
|
||||
archive_id != &first_uploaded_archive && archive_id != &second_uploaded_archive
|
||||
});
|
||||
assert_eq!(
|
||||
updated_archives.len(),
|
||||
1,
|
||||
"Only one new archive is expected among the uploaded"
|
||||
);
|
||||
let third_uploaded_archive = updated_archives.last().copied().unwrap();
|
||||
assert!(
|
||||
updated_timeline.checkpoints().max().unwrap()
|
||||
> third_upload_metadata.disk_consistent_lsn(),
|
||||
"Should not influence the last lsn by uploading an older checkpoint"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline
|
||||
.archive_data(third_uploaded_archive)
|
||||
.unwrap()
|
||||
.disk_consistent_lsn(),
|
||||
third_upload_metadata.disk_consistent_lsn(),
|
||||
"Uploaded archive should have corresponding Lsn"
|
||||
);
|
||||
assert_eq!(
|
||||
updated_timeline.stored_files(&local_timeline_path),
|
||||
vec![
|
||||
local_timeline_path.join("a"),
|
||||
local_timeline_path.join("b"),
|
||||
local_timeline_path.join("c"),
|
||||
local_timeline_path.join("d"),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
"Should have all files from three checkpoints without duplicates"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reupload_timeline_rejected() -> anyhow::Result<()> {
|
||||
let repo_harness = RepoHarness::create("reupload_timeline_rejected")?;
|
||||
let sync_id = ZTenantTimelineId::new(repo_harness.tenant_id, TIMELINE_ID);
|
||||
let storage = LocalFs::new(tempdir()?.path().to_owned(), &repo_harness.conf.workdir)?;
|
||||
let index = RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
storage
|
||||
.list()
|
||||
.await?
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
);
|
||||
let remote_assets = Arc::new((storage, index));
|
||||
let storage = &remote_assets.0;
|
||||
let index = &remote_assets.1;
|
||||
|
||||
let first_upload_metadata = dummy_metadata(Lsn(0x10));
|
||||
let first_checkpoint = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["a", "b"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
ensure_correct_timeline_upload(
|
||||
&repo_harness,
|
||||
Arc::clone(&remote_assets),
|
||||
TIMELINE_ID,
|
||||
first_checkpoint,
|
||||
)
|
||||
.await;
|
||||
let after_first_uploads = RemoteIndex::try_parse_descriptions_from_paths(
|
||||
repo_harness.conf,
|
||||
remote_assets
|
||||
.0
|
||||
.list()
|
||||
.await
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|storage_path| storage.local_path(&storage_path).unwrap()),
|
||||
);
|
||||
|
||||
let normal_upload_metadata = dummy_metadata(Lsn(0x20));
|
||||
assert_ne!(
|
||||
normal_upload_metadata.disk_consistent_lsn(),
|
||||
first_upload_metadata.disk_consistent_lsn()
|
||||
);
|
||||
|
||||
let checkpoint_with_no_files = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&[],
|
||||
normal_upload_metadata.clone(),
|
||||
)?;
|
||||
upload_timeline_checkpoint(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
checkpoint_with_no_files,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(index, &after_first_uploads).await;
|
||||
|
||||
let checkpoint_with_uploaded_lsn = create_local_timeline(
|
||||
&repo_harness,
|
||||
TIMELINE_ID,
|
||||
&["something", "new"],
|
||||
first_upload_metadata.clone(),
|
||||
)?;
|
||||
upload_timeline_checkpoint(
|
||||
repo_harness.conf,
|
||||
Arc::clone(&remote_assets),
|
||||
sync_id,
|
||||
checkpoint_with_uploaded_lsn,
|
||||
0,
|
||||
)
|
||||
.await;
|
||||
assert_index_descriptions(index, &after_first_uploads).await;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -11,10 +11,8 @@ use std::fmt::Display;
|
||||
use std::ops::{AddAssign, Range};
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use std::time::Duration;
|
||||
use utils::{
|
||||
lsn::{Lsn, RecordLsn},
|
||||
zid::ZTimelineId,
|
||||
};
|
||||
use zenith_utils::lsn::{Lsn, RecordLsn};
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize)]
|
||||
/// Key used in the Repository kv-store.
|
||||
@@ -184,12 +182,14 @@ impl Value {
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum TimelineSyncStatusUpdate {
|
||||
Uploaded,
|
||||
Downloaded,
|
||||
}
|
||||
|
||||
impl Display for TimelineSyncStatusUpdate {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
TimelineSyncStatusUpdate::Uploaded => "Uploaded",
|
||||
TimelineSyncStatusUpdate::Downloaded => "Downloaded",
|
||||
};
|
||||
f.write_str(s)
|
||||
@@ -249,17 +249,14 @@ pub trait Repository: Send + Sync {
|
||||
&self,
|
||||
timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult>;
|
||||
|
||||
/// Perform one compaction iteration.
|
||||
/// This function is periodically called by compactor thread.
|
||||
/// Also it can be explicitly requested per timeline through page server
|
||||
/// api's 'compact' command.
|
||||
/// perform one compaction iteration.
|
||||
/// this function is periodically called by compactor thread.
|
||||
fn compaction_iteration(&self) -> Result<()>;
|
||||
|
||||
/// detaches timeline-related in-memory data.
|
||||
/// detaches locally available timeline by stopping all threads and removing all the data.
|
||||
fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
|
||||
// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
|
||||
@@ -306,7 +303,6 @@ impl<'a, T> From<&'a RepositoryTimeline<T>> for LocalTimelineState {
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
@@ -317,7 +313,6 @@ pub struct GcResult {
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.layers_total += other.layers_total;
|
||||
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
|
||||
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
|
||||
self.layers_needed_by_branches += other.layers_needed_by_branches;
|
||||
self.layers_not_updated += other.layers_not_updated;
|
||||
@@ -435,9 +430,8 @@ pub mod repo_harness {
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
use hex_literal::hex;
|
||||
use utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
pub const TIMELINE_ID: ZTimelineId =
|
||||
ZTimelineId::from_array(hex!("11223344556677881122334455667788"));
|
||||
@@ -458,24 +452,8 @@ pub mod repo_harness {
|
||||
static ref LOCK: RwLock<()> = RwLock::new(());
|
||||
}
|
||||
|
||||
impl From<TenantConf> for TenantConfOpt {
|
||||
fn from(tenant_conf: TenantConf) -> Self {
|
||||
Self {
|
||||
checkpoint_distance: Some(tenant_conf.checkpoint_distance),
|
||||
compaction_target_size: Some(tenant_conf.compaction_target_size),
|
||||
compaction_period: Some(tenant_conf.compaction_period),
|
||||
compaction_threshold: Some(tenant_conf.compaction_threshold),
|
||||
gc_horizon: Some(tenant_conf.gc_horizon),
|
||||
gc_period: Some(tenant_conf.gc_period),
|
||||
image_creation_threshold: Some(tenant_conf.image_creation_threshold),
|
||||
pitr_interval: Some(tenant_conf.pitr_interval),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RepoHarness<'a> {
|
||||
pub conf: &'static PageServerConf,
|
||||
pub tenant_conf: TenantConf,
|
||||
pub tenant_id: ZTenantId,
|
||||
|
||||
pub lock_guard: (
|
||||
@@ -507,15 +485,12 @@ pub mod repo_harness {
|
||||
// OK in a test.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
let tenant_conf = TenantConf::dummy_conf();
|
||||
|
||||
let tenant_id = ZTenantId::generate();
|
||||
fs::create_dir_all(conf.tenant_path(&tenant_id))?;
|
||||
fs::create_dir_all(conf.timelines_path(&tenant_id))?;
|
||||
|
||||
Ok(Self {
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
lock_guard,
|
||||
})
|
||||
@@ -530,7 +505,6 @@ pub mod repo_harness {
|
||||
|
||||
let repo = LayeredRepository::new(
|
||||
self.conf,
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
RemoteIndex::empty(),
|
||||
@@ -746,7 +720,7 @@ mod tests {
|
||||
// FIXME: this doesn't actually remove any layer currently, given how the checkpointing
|
||||
// and compaction works. But it does set the 'cutoff' point so that the cross check
|
||||
// below should fail.
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
@@ -797,7 +771,7 @@ mod tests {
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
|
||||
assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
|
||||
match tline.get(*TEST_KEY, Lsn(0x25)) {
|
||||
@@ -820,7 +794,7 @@ mod tests {
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
.expect("Should have a local timeline");
|
||||
// this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
assert!(newtline.get(*TEST_KEY, Lsn(0x25)).is_ok());
|
||||
|
||||
Ok(())
|
||||
@@ -839,7 +813,7 @@ mod tests {
|
||||
make_some_layers(newtline.as_ref(), Lsn(0x60))?;
|
||||
|
||||
// run gc on parent
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, false)?;
|
||||
|
||||
// Check that the data is still accessible on the branch.
|
||||
assert_eq!(
|
||||
|
||||
@@ -1,180 +0,0 @@
|
||||
//! Functions for handling per-tenant configuration options
|
||||
//!
|
||||
//! If tenant is created with --config option,
|
||||
//! the tenant-specific config will be stored in tenant's directory.
|
||||
//! Otherwise, global pageserver's config is used.
|
||||
//!
|
||||
//! If the tenant config file is corrupted, the tenant will be disabled.
|
||||
//! We cannot use global or default config instead, because wrong settings
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
use crate::config::PageServerConf;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use utils::zid::ZTenantId;
|
||||
|
||||
pub const TENANT_CONFIG_NAME: &str = "config";
|
||||
|
||||
pub mod defaults {
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
|
||||
}
|
||||
|
||||
/// Per-tenant configuration options
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub struct TenantConf {
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub checkpoint_distance: u64,
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
// How often to check if there's compaction work to be done.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Duration,
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is #of bytes of WAL.
|
||||
// Page versions older than this are garbage collected away.
|
||||
pub gc_horizon: u64,
|
||||
// Interval at which garbage collection is triggered.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_period: Duration,
|
||||
// Delta layer churn threshold to create L1 image layers.
|
||||
pub image_creation_threshold: usize,
|
||||
// Determines how much history is retained, to allow
|
||||
// branching and read replicas at an older point in time.
|
||||
// The unit is time.
|
||||
// Page versions older than this are garbage collected away.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Duration,
|
||||
}
|
||||
|
||||
/// Same as TenantConf, but this struct preserves the information about
|
||||
/// which parameters are set and which are not.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
||||
pub struct TenantConfOpt {
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub compaction_period: Option<Duration>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub gc_period: Option<Duration>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub pitr_interval: Option<Duration>,
|
||||
}
|
||||
|
||||
impl TenantConfOpt {
|
||||
pub fn merge(&self, global_conf: TenantConf) -> TenantConf {
|
||||
TenantConf {
|
||||
checkpoint_distance: self
|
||||
.checkpoint_distance
|
||||
.unwrap_or(global_conf.checkpoint_distance),
|
||||
compaction_target_size: self
|
||||
.compaction_target_size
|
||||
.unwrap_or(global_conf.compaction_target_size),
|
||||
compaction_period: self
|
||||
.compaction_period
|
||||
.unwrap_or(global_conf.compaction_period),
|
||||
compaction_threshold: self
|
||||
.compaction_threshold
|
||||
.unwrap_or(global_conf.compaction_threshold),
|
||||
gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
|
||||
gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
|
||||
image_creation_threshold: self
|
||||
.image_creation_threshold
|
||||
.unwrap_or(global_conf.image_creation_threshold),
|
||||
pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update(&mut self, other: &TenantConfOpt) {
|
||||
if let Some(checkpoint_distance) = other.checkpoint_distance {
|
||||
self.checkpoint_distance = Some(checkpoint_distance);
|
||||
}
|
||||
if let Some(compaction_target_size) = other.compaction_target_size {
|
||||
self.compaction_target_size = Some(compaction_target_size);
|
||||
}
|
||||
if let Some(compaction_period) = other.compaction_period {
|
||||
self.compaction_period = Some(compaction_period);
|
||||
}
|
||||
if let Some(compaction_threshold) = other.compaction_threshold {
|
||||
self.compaction_threshold = Some(compaction_threshold);
|
||||
}
|
||||
if let Some(gc_horizon) = other.gc_horizon {
|
||||
self.gc_horizon = Some(gc_horizon);
|
||||
}
|
||||
if let Some(gc_period) = other.gc_period {
|
||||
self.gc_period = Some(gc_period);
|
||||
}
|
||||
if let Some(image_creation_threshold) = other.image_creation_threshold {
|
||||
self.image_creation_threshold = Some(image_creation_threshold);
|
||||
}
|
||||
if let Some(pitr_interval) = other.pitr_interval {
|
||||
self.pitr_interval = Some(pitr_interval);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl TenantConf {
|
||||
pub fn default() -> TenantConf {
|
||||
use defaults::*;
|
||||
|
||||
TenantConf {
|
||||
checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE,
|
||||
compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period"),
|
||||
compaction_threshold: DEFAULT_COMPACTION_THRESHOLD,
|
||||
gc_horizon: DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period"),
|
||||
image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
|
||||
.expect("cannot parse default PITR interval"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
pub fn path(conf: &'static PageServerConf, tenantid: ZTenantId) -> PathBuf {
|
||||
conf.tenant_path(&tenantid).join(TENANT_CONFIG_NAME)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf() -> Self {
|
||||
TenantConf {
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: 4 * 1024 * 1024,
|
||||
compaction_period: Duration::from_secs(10),
|
||||
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
image_creation_threshold: defaults::DEFAULT_IMAGE_CREATION_THRESHOLD,
|
||||
pitr_interval: Duration::from_secs(60 * 60),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3,64 +3,34 @@
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::remote_storage::{self, LocalTimelineInitStatus, RemoteIndex, SyncStartupData};
|
||||
use crate::remote_storage::RemoteIndex;
|
||||
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::timelines;
|
||||
use crate::timelines::CreateRepo;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{DatadirTimelineImpl, RepositoryImpl};
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::{Context, Result};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
mod tenants_state {
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{RwLock, RwLockReadGuard, RwLockWriteGuard},
|
||||
};
|
||||
|
||||
use utils::zid::ZTenantId;
|
||||
|
||||
use crate::tenant_mgr::Tenant;
|
||||
|
||||
lazy_static::lazy_static! {
|
||||
static ref TENANTS: RwLock<HashMap<ZTenantId, Tenant>> = RwLock::new(HashMap::new());
|
||||
}
|
||||
|
||||
pub(super) fn read_tenants() -> RwLockReadGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS
|
||||
.read()
|
||||
.expect("Failed to read() tenants lock, it got poisoned")
|
||||
}
|
||||
|
||||
pub(super) fn write_tenants() -> RwLockWriteGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS
|
||||
.write()
|
||||
.expect("Failed to write() tenants lock, it got poisoned")
|
||||
}
|
||||
lazy_static! {
|
||||
static ref TENANTS: Mutex<HashMap<ZTenantId, Tenant>> = Mutex::new(HashMap::new());
|
||||
}
|
||||
|
||||
struct Tenant {
|
||||
state: TenantState,
|
||||
/// Contains in-memory state, including the timeline that might not yet flushed on disk or loaded form disk.
|
||||
repo: Arc<RepositoryImpl>,
|
||||
/// Timelines, located locally in the pageserver's datadir.
|
||||
/// Timelines can entirely be removed entirely by the `detach` operation only.
|
||||
///
|
||||
/// Local timelines have more metadata that's loaded into memory,
|
||||
/// that is located in the `repo.timelines` field, [`crate::layered_repository::LayeredTimelineEntry`].
|
||||
local_timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
|
||||
|
||||
timelines: HashMap<ZTimelineId, Arc<DatadirTimelineImpl>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize, Clone, Copy, PartialEq, Eq)]
|
||||
@@ -89,23 +59,43 @@ impl fmt::Display for TenantState {
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize repositories with locally available timelines.
|
||||
/// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
/// are scheduled for download and added to the repository once download is completed.
|
||||
pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIndex> {
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
init_local_repositories(conf, local_timeline_init_statuses, &remote_index)?;
|
||||
Ok(remote_index)
|
||||
fn access_tenants() -> MutexGuard<'static, HashMap<ZTenantId, Tenant>> {
|
||||
TENANTS.lock().unwrap()
|
||||
}
|
||||
|
||||
// Sets up wal redo manager and repository for tenant. Reduces code duplocation.
|
||||
// Used during pageserver startup, or when new tenant is attached to pageserver.
|
||||
pub fn load_local_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> Arc<RepositoryImpl> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
remote_index.clone(),
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
timelines: HashMap::new(),
|
||||
}
|
||||
});
|
||||
Arc::clone(&tenant.repo)
|
||||
}
|
||||
|
||||
/// Updates tenants' repositories, changing their timelines state in memory.
|
||||
pub fn apply_timeline_sync_status_updates(
|
||||
conf: &'static PageServerConf,
|
||||
remote_index: &RemoteIndex,
|
||||
remote_index: RemoteIndex,
|
||||
sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
|
||||
) {
|
||||
if sync_status_updates.is_empty() {
|
||||
@@ -116,21 +106,23 @@ pub fn apply_timeline_sync_status_updates(
|
||||
"Applying sync status updates for {} timelines",
|
||||
sync_status_updates.len()
|
||||
);
|
||||
debug!("Sync status updates: {sync_status_updates:?}");
|
||||
trace!("Sync status updates: {:?}", sync_status_updates);
|
||||
|
||||
for (tenant_id, status_updates) in sync_status_updates {
|
||||
let repo = match load_local_repo(conf, tenant_id, remote_index) {
|
||||
Ok(repo) => repo,
|
||||
Err(e) => {
|
||||
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",);
|
||||
continue;
|
||||
for (tenant_id, tenant_timelines_sync_status_updates) in sync_status_updates {
|
||||
let repo = load_local_repo(conf, tenant_id, &remote_index);
|
||||
|
||||
for (timeline_id, timeline_sync_status_update) in tenant_timelines_sync_status_updates {
|
||||
match repo.apply_timeline_remote_sync_status_update(timeline_id, timeline_sync_status_update)
|
||||
{
|
||||
Ok(_) => debug!(
|
||||
"successfully applied timeline sync status update: {} -> {}",
|
||||
timeline_id, timeline_sync_status_update
|
||||
),
|
||||
Err(e) => error!(
|
||||
"Failed to apply timeline sync status update for tenant {}. timeline {} update {} Error: {:#}",
|
||||
tenant_id, timeline_id, timeline_sync_status_update, e
|
||||
),
|
||||
}
|
||||
};
|
||||
match apply_timeline_remote_sync_status_updates(&repo, status_updates) {
|
||||
Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"),
|
||||
Err(e) => error!(
|
||||
"Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}"
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -139,7 +131,7 @@ pub fn apply_timeline_sync_status_updates(
|
||||
/// Shut down all tenants. This runs as part of pageserver shutdown.
|
||||
///
|
||||
pub fn shutdown_all_tenants() {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let mut m = access_tenants();
|
||||
let mut tenantids = Vec::new();
|
||||
for (tenantid, tenant) in m.iter_mut() {
|
||||
tenant.state = TenantState::Stopping;
|
||||
@@ -159,16 +151,22 @@ pub fn shutdown_all_tenants() {
|
||||
// should be no more activity in any of the repositories.
|
||||
//
|
||||
// On error, log it but continue with the shutdown for other tenants.
|
||||
for tenant_id in tenantids {
|
||||
debug!("shutdown tenant {tenant_id}");
|
||||
match get_repository_for_tenant(tenant_id) {
|
||||
for tenantid in tenantids {
|
||||
debug!("shutdown tenant {}", tenantid);
|
||||
match get_repository_for_tenant(tenantid) {
|
||||
Ok(repo) => {
|
||||
if let Err(err) = repo.checkpoint() {
|
||||
error!("Could not checkpoint tenant {tenant_id} during shutdown: {err:?}");
|
||||
error!(
|
||||
"Could not checkpoint tenant {} during shutdown: {:?}",
|
||||
tenantid, err
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
error!("Could not get repository for tenant {tenant_id} during shutdown: {err:?}");
|
||||
error!(
|
||||
"Could not get repository for tenant {} during shutdown: {:?}",
|
||||
tenantid, err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -176,21 +174,19 @@ pub fn shutdown_all_tenants() {
|
||||
|
||||
pub fn create_tenant_repository(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: ZTenantId,
|
||||
tenantid: ZTenantId,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<Option<ZTenantId>> {
|
||||
match tenants_state::write_tenants().entry(tenant_id) {
|
||||
) -> Result<Option<ZTenantId>> {
|
||||
match access_tenants().entry(tenantid) {
|
||||
Entry::Occupied(_) => {
|
||||
debug!("tenant {tenant_id} already exists");
|
||||
debug!("tenant {} already exists", tenantid);
|
||||
Ok(None)
|
||||
}
|
||||
Entry::Vacant(v) => {
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
|
||||
let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenantid));
|
||||
let repo = timelines::create_repo(
|
||||
conf,
|
||||
tenant_conf,
|
||||
tenant_id,
|
||||
tenantid,
|
||||
CreateRepo::Real {
|
||||
wal_redo_manager,
|
||||
remote_index,
|
||||
@@ -199,39 +195,28 @@ pub fn create_tenant_repository(
|
||||
v.insert(Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
local_timelines: HashMap::new(),
|
||||
timelines: HashMap::new(),
|
||||
});
|
||||
Ok(Some(tenant_id))
|
||||
Ok(Some(tenantid))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_tenant_config(
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<()> {
|
||||
info!("configuring tenant {tenant_id}");
|
||||
let repo = get_repository_for_tenant(tenant_id)?;
|
||||
|
||||
repo.update_tenant_config(tenant_conf)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_tenant_state(tenantid: ZTenantId) -> Option<TenantState> {
|
||||
Some(tenants_state::read_tenants().get(&tenantid)?.state)
|
||||
Some(access_tenants().get(&tenantid)?.state)
|
||||
}
|
||||
|
||||
///
|
||||
/// Change the state of a tenant to Active and launch its compactor and GC
|
||||
/// threads. If the tenant was already in Active state or Stopping, does nothing.
|
||||
///
|
||||
pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
pub fn activate_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> Result<()> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m
|
||||
.get_mut(&tenant_id)
|
||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?;
|
||||
.with_context(|| format!("Tenant not found for id {}", tenant_id))?;
|
||||
|
||||
info!("activating tenant {tenant_id}");
|
||||
info!("activating tenant {}", tenant_id);
|
||||
|
||||
match tenant.state {
|
||||
// If the tenant is already active, nothing to do.
|
||||
@@ -245,7 +230,7 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
None,
|
||||
"Compactor thread",
|
||||
true,
|
||||
move || crate::tenant_threads::compact_loop(tenant_id),
|
||||
move || crate::tenant_threads::compact_loop(tenant_id, conf),
|
||||
)?;
|
||||
|
||||
let gc_spawn_result = thread_mgr::spawn(
|
||||
@@ -254,15 +239,19 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
None,
|
||||
"GC thread",
|
||||
true,
|
||||
move || crate::tenant_threads::gc_loop(tenant_id),
|
||||
move || crate::tenant_threads::gc_loop(tenant_id, conf),
|
||||
)
|
||||
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
|
||||
.with_context(|| format!("Failed to launch GC thread for tenant {}", tenant_id));
|
||||
|
||||
if let Err(e) = &gc_spawn_result {
|
||||
error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
|
||||
error!(
|
||||
"Failed to start GC thread for tenant {}, stopping its checkpointer thread: {:?}",
|
||||
tenant_id, e
|
||||
);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
||||
return gc_spawn_result;
|
||||
}
|
||||
|
||||
tenant.state = TenantState::Active;
|
||||
}
|
||||
|
||||
@@ -273,81 +262,39 @@ pub fn activate_tenant(tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn get_repository_for_tenant(tenant_id: ZTenantId) -> anyhow::Result<Arc<RepositoryImpl>> {
|
||||
let m = tenants_state::read_tenants();
|
||||
pub fn get_repository_for_tenant(tenantid: ZTenantId) -> Result<Arc<RepositoryImpl>> {
|
||||
let m = access_tenants();
|
||||
let tenant = m
|
||||
.get(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))?;
|
||||
.get(&tenantid)
|
||||
.with_context(|| format!("Tenant {} not found", tenantid))?;
|
||||
|
||||
Ok(Arc::clone(&tenant.repo))
|
||||
}
|
||||
|
||||
/// Retrieves local timeline for tenant.
|
||||
/// Loads it into memory if it is not already loaded.
|
||||
pub fn get_local_timeline_with_load(
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<Arc<DatadirTimelineImpl>> {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
// Retrieve timeline for tenant. Load it into memory if it is not already loaded
|
||||
pub fn get_timeline_for_tenant_load(
|
||||
tenantid: ZTenantId,
|
||||
timelineid: ZTimelineId,
|
||||
) -> Result<Arc<DatadirTimelineImpl>> {
|
||||
let mut m = access_tenants();
|
||||
let tenant = m
|
||||
.get_mut(&tenant_id)
|
||||
.with_context(|| format!("Tenant {tenant_id} not found"))?;
|
||||
.get_mut(&tenantid)
|
||||
.with_context(|| format!("Tenant {} not found", tenantid))?;
|
||||
|
||||
if let Some(page_tline) = tenant.local_timelines.get(&timeline_id) {
|
||||
if let Some(page_tline) = tenant.timelines.get(&timelineid) {
|
||||
return Ok(Arc::clone(page_tline));
|
||||
}
|
||||
// First access to this timeline. Create a DatadirTimeline wrapper for it
|
||||
let tline = tenant
|
||||
.repo
|
||||
.get_timeline_load(timelineid)
|
||||
.with_context(|| format!("Timeline {} not found for tenant {}", timelineid, tenantid))?;
|
||||
|
||||
let page_tline = new_local_timeline(&tenant.repo, timeline_id)
|
||||
.with_context(|| format!("Failed to create new local timeline for tenant {tenant_id}"))?;
|
||||
tenant
|
||||
.local_timelines
|
||||
.insert(timeline_id, Arc::clone(&page_tline));
|
||||
Ok(page_tline)
|
||||
}
|
||||
let repartition_distance = tenant.repo.conf.checkpoint_distance / 10;
|
||||
|
||||
pub fn detach_timeline(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// shutdown the timeline threads (this shuts down the walreceiver)
|
||||
thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
|
||||
|
||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant
|
||||
.repo
|
||||
.detach_timeline(timeline_id)
|
||||
.context("Failed to detach inmem tenant timeline")?;
|
||||
tenant.local_timelines.remove(&timeline_id);
|
||||
}
|
||||
None => bail!("Tenant {tenant_id} not found in local tenant state"),
|
||||
}
|
||||
|
||||
let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local timeline directory '{}'",
|
||||
local_timeline_directory.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn new_local_timeline(
|
||||
repo: &RepositoryImpl,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<Arc<DatadirTimeline<LayeredRepository>>> {
|
||||
let inmem_timeline = repo.get_timeline_load(timeline_id).with_context(|| {
|
||||
format!("Inmem timeline {timeline_id} not found in tenant's repository")
|
||||
})?;
|
||||
let repartition_distance = repo.get_checkpoint_distance() / 10;
|
||||
let page_tline = Arc::new(DatadirTimelineImpl::new(
|
||||
inmem_timeline,
|
||||
repartition_distance,
|
||||
));
|
||||
let page_tline = Arc::new(DatadirTimelineImpl::new(tline, repartition_distance));
|
||||
page_tline.init_logical_size()?;
|
||||
tenant.timelines.insert(timelineid, Arc::clone(&page_tline));
|
||||
Ok(page_tline)
|
||||
}
|
||||
|
||||
@@ -359,122 +306,15 @@ pub struct TenantInfo {
|
||||
pub state: TenantState,
|
||||
}
|
||||
|
||||
pub fn list_tenants() -> Vec<TenantInfo> {
|
||||
tenants_state::read_tenants()
|
||||
pub fn list_tenants() -> Result<Vec<TenantInfo>> {
|
||||
access_tenants()
|
||||
.iter()
|
||||
.map(|(id, tenant)| TenantInfo {
|
||||
id: *id,
|
||||
state: tenant.state,
|
||||
.map(|v| {
|
||||
let (id, tenant) = v;
|
||||
Ok(TenantInfo {
|
||||
id: *id,
|
||||
state: tenant.state,
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn init_local_repositories(
|
||||
conf: &'static PageServerConf,
|
||||
local_timeline_init_statuses: HashMap<ZTenantId, HashMap<ZTimelineId, LocalTimelineInitStatus>>,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> anyhow::Result<(), anyhow::Error> {
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
// initialize local tenant
|
||||
let repo = load_local_repo(conf, tenant_id, remote_index)
|
||||
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
||||
|
||||
let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||
}
|
||||
LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
|
||||
so skipped for adding into repository until sync is finished"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Lets fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
apply_timeline_remote_sync_status_updates(&repo, status_updates)
|
||||
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn apply_timeline_remote_sync_status_updates(
|
||||
repo: &LayeredRepository,
|
||||
status_updates: HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut registration_queue = Vec::with_capacity(status_updates.len());
|
||||
|
||||
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
|
||||
for (timeline_id, status_update) in status_updates {
|
||||
repo.apply_timeline_remote_sync_status_update(timeline_id, status_update)
|
||||
.with_context(|| {
|
||||
format!("Failed to load timeline {timeline_id} into in-memory repository")
|
||||
})?;
|
||||
match status_update {
|
||||
TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id),
|
||||
}
|
||||
}
|
||||
|
||||
for timeline_id in registration_queue {
|
||||
let tenant_id = repo.tenant_id();
|
||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
|
||||
Entry::Occupied(_) => {
|
||||
bail!("Local timeline {timeline_id} already registered")
|
||||
}
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(new_local_timeline(repo, timeline_id).with_context(|| {
|
||||
format!("Failed to register new local timeline for tenant {tenant_id}")
|
||||
})?);
|
||||
}
|
||||
},
|
||||
None => bail!(
|
||||
"Tenant {} not found in local tenant state",
|
||||
repo.tenant_id()
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Sets up wal redo manager and repository for tenant. Reduces code duplication.
|
||||
// Used during pageserver startup, or when new tenant is attached to pageserver.
|
||||
fn load_local_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> anyhow::Result<Arc<RepositoryImpl>> {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
let tenant = m.entry(tenant_id).or_insert_with(|| {
|
||||
// Set up a WAL redo manager, for applying WAL records.
|
||||
let walredo_mgr = PostgresRedoManager::new(conf, tenant_id);
|
||||
|
||||
// Set up an object repository, for actual data storage.
|
||||
let repo: Arc<LayeredRepository> = Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
TenantConfOpt::default(),
|
||||
Arc::new(walredo_mgr),
|
||||
tenant_id,
|
||||
remote_index.clone(),
|
||||
conf.remote_storage_config.is_some(),
|
||||
));
|
||||
Tenant {
|
||||
state: TenantState::Idle,
|
||||
repo,
|
||||
local_timelines: HashMap::new(),
|
||||
}
|
||||
});
|
||||
|
||||
// Restore tenant config
|
||||
let tenant_conf = LayeredRepository::load_tenant_config(conf, tenant_id)?;
|
||||
tenant.repo.update_tenant_config(tenant_conf)?;
|
||||
|
||||
Ok(Arc::clone(&tenant.repo))
|
||||
}
|
||||
|
||||
@@ -1,18 +1,19 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as compaction and GC
|
||||
use crate::config::PageServerConf;
|
||||
use crate::repository::Repository;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use anyhow::Result;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
///
|
||||
/// Compaction thread's main loop
|
||||
///
|
||||
pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
|
||||
if let Err(err) = compact_loop_ext(tenantid) {
|
||||
pub fn compact_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
if let Err(err) = compact_loop_ext(tenantid, conf) {
|
||||
error!("compact loop terminated with error: {:?}", err);
|
||||
Err(err)
|
||||
} else {
|
||||
@@ -20,15 +21,13 @@ pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
|
||||
}
|
||||
}
|
||||
|
||||
fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
|
||||
fn compact_loop_ext(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let compaction_period = repo.get_compaction_period();
|
||||
|
||||
std::thread::sleep(compaction_period);
|
||||
std::thread::sleep(conf.compaction_period);
|
||||
trace!("compaction thread for tenant {} waking up", tenantid);
|
||||
|
||||
// Compact timelines
|
||||
@@ -47,23 +46,23 @@ fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
|
||||
///
|
||||
/// GC thread's main loop
|
||||
///
|
||||
pub fn gc_loop(tenantid: ZTenantId) -> Result<()> {
|
||||
pub fn gc_loop(tenantid: ZTenantId, conf: &'static PageServerConf) -> Result<()> {
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
trace!("gc thread for tenant {} waking up", tenantid);
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let gc_horizon = repo.get_gc_horizon();
|
||||
|
||||
// Garbage collect old files that are not needed for PITR anymore
|
||||
if gc_horizon > 0 {
|
||||
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
|
||||
if conf.gc_horizon > 0 {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.gc_iteration(None, conf.gc_horizon, false)?;
|
||||
}
|
||||
|
||||
// TODO Write it in more adequate way using
|
||||
// condvar.wait_timeout() or something
|
||||
let mut sleep_time = repo.get_gc_period().as_secs();
|
||||
let mut sleep_time = conf.gc_period.as_secs();
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
|
||||
{
|
||||
sleep_time -= 1;
|
||||
|
||||
@@ -47,7 +47,7 @@ use tracing::{debug, error, info, warn};
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::shutdown_pageserver;
|
||||
|
||||
@@ -130,14 +130,12 @@ struct PageServerThread {
|
||||
}
|
||||
|
||||
/// Launch a new thread
|
||||
/// Note: if shutdown_process_on_error is set to true failure
|
||||
/// of the thread will lead to shutdown of entire process
|
||||
pub fn spawn<F>(
|
||||
kind: ThreadKind,
|
||||
tenant_id: Option<ZTenantId>,
|
||||
timeline_id: Option<ZTimelineId>,
|
||||
name: &str,
|
||||
shutdown_process_on_error: bool,
|
||||
fail_on_error: bool,
|
||||
f: F,
|
||||
) -> std::io::Result<()>
|
||||
where
|
||||
@@ -177,7 +175,7 @@ where
|
||||
thread_id,
|
||||
thread_rc2,
|
||||
shutdown_rx,
|
||||
shutdown_process_on_error,
|
||||
fail_on_error,
|
||||
f,
|
||||
)
|
||||
}) {
|
||||
@@ -203,7 +201,7 @@ fn thread_wrapper<F>(
|
||||
thread_id: u64,
|
||||
thread: Arc<PageServerThread>,
|
||||
shutdown_rx: watch::Receiver<()>,
|
||||
shutdown_process_on_error: bool,
|
||||
fail_on_error: bool,
|
||||
f: F,
|
||||
) where
|
||||
F: FnOnce() -> anyhow::Result<()> + Send + 'static,
|
||||
@@ -223,41 +221,27 @@ fn thread_wrapper<F>(
|
||||
let result = panic::catch_unwind(AssertUnwindSafe(f));
|
||||
|
||||
// Remove our entry from the global hashmap.
|
||||
let thread = THREADS
|
||||
.lock()
|
||||
.unwrap()
|
||||
.remove(&thread_id)
|
||||
.expect("no thread in registry");
|
||||
THREADS.lock().unwrap().remove(&thread_id);
|
||||
|
||||
match result {
|
||||
Ok(Ok(())) => debug!("Thread '{}' exited normally", thread_name),
|
||||
Ok(Err(err)) => {
|
||||
if shutdown_process_on_error {
|
||||
if fail_on_error {
|
||||
error!(
|
||||
"Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
||||
"Shutting down: thread '{}' exited with error: {:?}",
|
||||
thread_name, err
|
||||
);
|
||||
shutdown_pageserver(1);
|
||||
shutdown_pageserver();
|
||||
} else {
|
||||
error!(
|
||||
"Thread '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
|
||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
||||
);
|
||||
error!("Thread '{}' exited with error: {:?}", thread_name, err);
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
if shutdown_process_on_error {
|
||||
error!(
|
||||
"Shutting down: thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
||||
);
|
||||
shutdown_pageserver(1);
|
||||
} else {
|
||||
error!(
|
||||
"Thread '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
|
||||
thread_name, thread.tenant_id, thread.timeline_id, err
|
||||
);
|
||||
}
|
||||
error!(
|
||||
"Shutting down: thread '{}' panicked: {:?}",
|
||||
thread_name, err
|
||||
);
|
||||
shutdown_pageserver();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
//! Timeline management code
|
||||
//
|
||||
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use anyhow::{bail, Context, Result};
|
||||
use postgres_ffi::ControlFileData;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
@@ -14,18 +14,15 @@ use std::{
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
crashsafe_dir, logging,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::{crashsafe_dir, logging};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
layered_repository::metadata::TimelineMetadata,
|
||||
remote_storage::RemoteIndex,
|
||||
repository::{LocalTimelineState, Repository},
|
||||
tenant_config::TenantConfOpt,
|
||||
DatadirTimeline, RepositoryImpl,
|
||||
};
|
||||
use crate::{import_datadir, LOG_FILE_NAME};
|
||||
@@ -106,7 +103,7 @@ impl LocalTimelineInfo {
|
||||
match repo_timeline {
|
||||
RepositoryTimeline::Loaded(_) => {
|
||||
let datadir_tline =
|
||||
tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id)?;
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id)?;
|
||||
Self::from_loaded_timeline(&datadir_tline, include_non_incremental_logical_size)
|
||||
}
|
||||
RepositoryTimeline::Unloaded { metadata } => Ok(Self::from_unloaded_timeline(metadata)),
|
||||
@@ -117,8 +114,8 @@ impl LocalTimelineInfo {
|
||||
#[serde_as]
|
||||
#[derive(Debug, Serialize, Deserialize, Clone)]
|
||||
pub struct RemoteTimelineInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
pub awaits_download: bool,
|
||||
}
|
||||
|
||||
@@ -152,8 +149,8 @@ pub fn init_pageserver(
|
||||
|
||||
if let Some(tenant_id) = create_tenant {
|
||||
println!("initializing tenantid {}", tenant_id);
|
||||
let repo = create_repo(conf, TenantConfOpt::default(), tenant_id, CreateRepo::Dummy)
|
||||
.context("failed to create repo")?;
|
||||
let repo =
|
||||
create_repo(conf, tenant_id, CreateRepo::Dummy).context("failed to create repo")?;
|
||||
let new_timeline_id = initial_timeline_id.unwrap_or_else(ZTimelineId::generate);
|
||||
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())
|
||||
.context("failed to create initial timeline")?;
|
||||
@@ -176,7 +173,6 @@ pub enum CreateRepo {
|
||||
|
||||
pub fn create_repo(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_conf: TenantConfOpt,
|
||||
tenant_id: ZTenantId,
|
||||
create_repo: CreateRepo,
|
||||
) -> Result<Arc<RepositoryImpl>> {
|
||||
@@ -203,11 +199,9 @@ pub fn create_repo(
|
||||
};
|
||||
|
||||
let repo_dir = conf.tenant_path(&tenant_id);
|
||||
ensure!(
|
||||
!repo_dir.exists(),
|
||||
"cannot create new tenant repo: '{}' directory already exists",
|
||||
tenant_id
|
||||
);
|
||||
if repo_dir.exists() {
|
||||
bail!("tenant {} directory already exists", tenant_id);
|
||||
}
|
||||
|
||||
// top-level dir may exist if we are creating it through CLI
|
||||
crashsafe_dir::create_dir_all(&repo_dir)
|
||||
@@ -215,12 +209,8 @@ pub fn create_repo(
|
||||
crashsafe_dir::create_dir(conf.timelines_path(&tenant_id))?;
|
||||
info!("created directory structure in {}", repo_dir.display());
|
||||
|
||||
// Save tenant's config
|
||||
LayeredRepository::persist_tenant_config(conf, tenant_id, tenant_conf)?;
|
||||
|
||||
Ok(Arc::new(LayeredRepository::new(
|
||||
conf,
|
||||
tenant_conf,
|
||||
wal_redo_manager,
|
||||
tenant_id,
|
||||
remote_index,
|
||||
@@ -385,7 +375,7 @@ pub(crate) fn create_timeline(
|
||||
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
|
||||
// load the timeline into memory
|
||||
let loaded_timeline =
|
||||
tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
|
||||
LocalTimelineInfo::from_loaded_timeline(&loaded_timeline, false)
|
||||
.context("cannot fill timeline info")?
|
||||
}
|
||||
@@ -393,7 +383,7 @@ pub(crate) fn create_timeline(
|
||||
bootstrap_timeline(conf, tenant_id, new_timeline_id, repo.as_ref())?;
|
||||
// load the timeline into memory
|
||||
let new_timeline =
|
||||
tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, new_timeline_id)?;
|
||||
LocalTimelineInfo::from_loaded_timeline(&new_timeline, false)
|
||||
.context("cannot fill timeline info")?
|
||||
}
|
||||
|
||||
@@ -11,15 +11,15 @@
|
||||
//! src/backend/storage/file/fd.c
|
||||
//!
|
||||
use lazy_static::lazy_static;
|
||||
use once_cell::sync::OnceCell;
|
||||
use std::fs::{File, OpenOptions};
|
||||
use std::io::{Error, ErrorKind, Read, Seek, SeekFrom, Write};
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::{RwLock, RwLockWriteGuard};
|
||||
use zenith_metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec};
|
||||
|
||||
use metrics::{register_histogram_vec, register_int_gauge_vec, HistogramVec, IntGaugeVec};
|
||||
use once_cell::sync::OnceCell;
|
||||
|
||||
// Metrics collected on disk IO operations
|
||||
const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
||||
|
||||
@@ -38,7 +38,7 @@ use postgres_ffi::nonrelfile_utils::mx_offset_to_member_segment;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::TransactionId;
|
||||
use postgres_ffi::{pg_constants, CheckPoint};
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
@@ -635,10 +635,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
|
||||
segno,
|
||||
rpageno,
|
||||
if is_commit {
|
||||
ZenithWalRecord::ClogSetCommitted {
|
||||
xids: page_xids,
|
||||
timestamp: parsed.xact_time,
|
||||
}
|
||||
ZenithWalRecord::ClogSetCommitted { xids: page_xids }
|
||||
} else {
|
||||
ZenithWalRecord::ClogSetAborted { xids: page_xids }
|
||||
},
|
||||
@@ -655,10 +652,7 @@ impl<'a, R: Repository> WalIngest<'a, R> {
|
||||
segno,
|
||||
rpageno,
|
||||
if is_commit {
|
||||
ZenithWalRecord::ClogSetCommitted {
|
||||
xids: page_xids,
|
||||
timestamp: parsed.xact_time,
|
||||
}
|
||||
ZenithWalRecord::ClogSetCommitted { xids: page_xids }
|
||||
} else {
|
||||
ZenithWalRecord::ClogSetAborted { xids: page_xids }
|
||||
},
|
||||
|
||||
@@ -29,11 +29,11 @@ use tokio_postgres::replication::ReplicationStream;
|
||||
use tokio_postgres::{Client, NoTls, SimpleQueryMessage, SimpleQueryRow};
|
||||
use tokio_stream::StreamExt;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
pq_proto::ZenithFeedback,
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::pq_proto::ZenithFeedback;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTenantTimelineId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
//
|
||||
// We keep one WAL Receiver active per timeline.
|
||||
@@ -93,7 +93,7 @@ pub fn launch_wal_receiver(
|
||||
receivers.insert((tenantid, timelineid), receiver);
|
||||
|
||||
// Update tenant state and start tenant threads, if they are not running yet.
|
||||
tenant_mgr::activate_tenant(tenantid)?;
|
||||
tenant_mgr::activate_tenant(conf, tenantid)?;
|
||||
}
|
||||
};
|
||||
Ok(())
|
||||
@@ -184,7 +184,7 @@ fn walreceiver_main(
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)
|
||||
.with_context(|| format!("no repository found for tenant {}", tenant_id))?;
|
||||
let timeline =
|
||||
tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).with_context(|| {
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).with_context(|| {
|
||||
format!(
|
||||
"local timeline {} not found for tenant {}",
|
||||
timeline_id, tenant_id
|
||||
@@ -305,7 +305,7 @@ fn walreceiver_main(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_timeline| remote_timeline.metadata.disk_consistent_lsn())
|
||||
.and_then(|e| e.disk_consistent_lsn())
|
||||
.unwrap_or(Lsn(0)) // no checkpoint was uploaded
|
||||
});
|
||||
|
||||
|
||||
@@ -24,10 +24,7 @@ pub enum ZenithWalRecord {
|
||||
flags: u8,
|
||||
},
|
||||
/// Mark transaction IDs as committed on a CLOG page
|
||||
ClogSetCommitted {
|
||||
xids: Vec<TransactionId>,
|
||||
timestamp: TimestampTz,
|
||||
},
|
||||
ClogSetCommitted { xids: Vec<TransactionId> },
|
||||
/// Mark transaction IDs as aborted on a CLOG page
|
||||
ClogSetAborted { xids: Vec<TransactionId> },
|
||||
/// Extend multixact offsets SLRU
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use log::*;
|
||||
use nix::poll::*;
|
||||
use serde::Serialize;
|
||||
use std::fs;
|
||||
@@ -34,15 +35,17 @@ use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command};
|
||||
use std::sync::Mutex;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock, zid::ZTenantId};
|
||||
use zenith_metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::nonblock::set_nonblock;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Key;
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use metrics::{register_histogram, register_int_counter, Histogram, IntCounter};
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_bitshift;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_flags_offset;
|
||||
use postgres_ffi::nonrelfile_utils::mx_offset_to_member_offset;
|
||||
@@ -283,11 +286,6 @@ impl PostgresRedoManager {
|
||||
// If something went wrong, don't try to reuse the process. Kill it, and
|
||||
// next request will launch a new one.
|
||||
if result.is_err() {
|
||||
error!(
|
||||
"error applying {} WAL records to reconstruct page image at LSN {}",
|
||||
records.len(),
|
||||
lsn
|
||||
);
|
||||
let process = process_guard.take().unwrap();
|
||||
process.kill();
|
||||
}
|
||||
@@ -392,7 +390,7 @@ impl PostgresRedoManager {
|
||||
}
|
||||
// Non-relational WAL records are handled here, with custom code that has the
|
||||
// same effects as the corresponding Postgres WAL redo function.
|
||||
ZenithWalRecord::ClogSetCommitted { xids, timestamp } => {
|
||||
ZenithWalRecord::ClogSetCommitted { xids } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
key_to_slru_block(key).or(Err(WalRedoError::InvalidRecord))?;
|
||||
assert_eq!(
|
||||
@@ -426,21 +424,6 @@ impl PostgresRedoManager {
|
||||
page,
|
||||
);
|
||||
}
|
||||
|
||||
// Append the timestamp
|
||||
if page.len() == pg_constants::BLCKSZ as usize + 8 {
|
||||
page.truncate(pg_constants::BLCKSZ as usize);
|
||||
}
|
||||
if page.len() == pg_constants::BLCKSZ as usize {
|
||||
page.extend_from_slice(×tamp.to_be_bytes());
|
||||
} else {
|
||||
warn!(
|
||||
"CLOG blk {} in seg {} has invalid size {}",
|
||||
blknum,
|
||||
segno,
|
||||
page.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
ZenithWalRecord::ClogSetAborted { xids } => {
|
||||
let (slru_kind, segno, blknum) =
|
||||
@@ -720,12 +703,7 @@ impl PostgresRedoProcess {
|
||||
// If we have more data to write, wake up if 'stdin' becomes writeable or
|
||||
// we have data to read. Otherwise only wake up if there's data to read.
|
||||
let nfds = if nwrite < writebuf.len() { 3 } else { 2 };
|
||||
let n = loop {
|
||||
match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) {
|
||||
Err(e) if e == nix::errno::Errno::EINTR => continue,
|
||||
res => break res,
|
||||
}
|
||||
}?;
|
||||
let n = nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32)?;
|
||||
|
||||
if n == 0 {
|
||||
return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
|
||||
|
||||
@@ -17,8 +17,8 @@ log = "0.4.14"
|
||||
memoffset = "0.6.2"
|
||||
thiserror = "1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.59.1"
|
||||
@@ -88,8 +88,8 @@ fn main() {
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
//
|
||||
.clang_arg("-I../../tmp_install/include/server")
|
||||
.clang_arg("-I../../tmp_install/include/postgresql/server")
|
||||
.clang_arg("-I../tmp_install/include/server")
|
||||
.clang_arg("-I../tmp_install/include/postgresql/server")
|
||||
//
|
||||
// Finish the builder and generate the bindings.
|
||||
//
|
||||
@@ -43,7 +43,7 @@ impl ControlFileData {
|
||||
/// Interpret a slice of bytes as a Postgres control file.
|
||||
///
|
||||
pub fn decode(buf: &[u8]) -> Result<ControlFileData> {
|
||||
use utils::bin_ser::LeSer;
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
|
||||
// Check that the slice has the expected size. The control file is
|
||||
// padded with zeros up to a 512 byte sector size, so accept a
|
||||
@@ -77,7 +77,7 @@ impl ControlFileData {
|
||||
///
|
||||
/// The CRC is recomputed to match the contents of the fields.
|
||||
pub fn encode(&self) -> Bytes {
|
||||
use utils::bin_ser::LeSer;
|
||||
use zenith_utils::bin_ser::LeSer;
|
||||
|
||||
// Serialize into a new buffer.
|
||||
let b = self.ser().unwrap();
|
||||
@@ -4,7 +4,7 @@
|
||||
//! This understands the WAL page and record format, enough to figure out where the WAL record
|
||||
//! boundaries are, and to reassemble WAL records that cross page boundaries.
|
||||
//!
|
||||
//! This functionality is needed by both the pageserver and the safekeepers. The pageserver needs
|
||||
//! This functionality is needed by both the pageserver and the walkeepers. The pageserver needs
|
||||
//! to look deeper into the WAL records to also understand which blocks they modify, the code
|
||||
//! for that is in pageserver/src/walrecord.rs
|
||||
//!
|
||||
@@ -18,7 +18,7 @@ use crc32c::*;
|
||||
use log::*;
|
||||
use std::cmp::min;
|
||||
use thiserror::Error;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub struct WalStreamDecoder {
|
||||
lsn: Lsn,
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user