Mirror of https://github.com/neondatabase/neon.git (synced 2026-01-18 10:52:55 +00:00)

Compare commits: relsize_ca...sort-locks (1 commit, 44feda0061)
@@ -6,7 +6,5 @@ timeout = 30

[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg
# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
# and scp neither worked for me
transfer_method = piped
scp_if_ssh = True
pipelining = True
@@ -1,7 +1,3 @@
# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
# (use pre 8.5 option name to cope with old ssh in CI)
PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com

Host tele.zenith.tech
User admin
Port 3023
@@ -12,7 +12,6 @@ pageservers
safekeepers

[storage:vars]
env_name = neon-stress
console_mgmt_base_url = http://neon-stress-console.local
bucket_name = neon-storage-ireland
bucket_region = eu-west-1
@@ -1,7 +1,6 @@
[pageservers]
#zenith-1-ps-1 console_region_id=1
zenith-1-ps-2 console_region_id=1
zenith-1-ps-3 console_region_id=1

[safekeepers]
zenith-1-sk-1 console_region_id=1
@@ -13,7 +12,6 @@ pageservers
safekeepers

[storage:vars]
env_name = prod-1
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2
@@ -13,7 +13,6 @@ pageservers
safekeepers

[storage:vars]
env_name = us-stage
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1
@@ -6,7 +6,7 @@ After=network.target auditd.service
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
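For reference, once Ansible substitutes the Jinja variables, the ExecStart variant without -p renders to a single command roughly like this. This is a sketch: the hostname, bucket and environment values are taken from the inventories earlier on this page, while the etcd endpoint is a made-up placeholder.

    /usr/local/bin/safekeeper \
        -l zenith-1-sk-1.local:6500 \
        --listen-http zenith-1-sk-1.local:7676 \
        -D /storage/safekeeper/data \
        --broker-endpoints=http://zenith-1-etcd-1.local:2379 \
        --remote-storage='{bucket_name="zenith-storage-oregon", bucket_region="us-west-2", prefix_in_bucket="prod-1/wal"}'

The two ExecStart variants differ in whether the pageserver address is passed with -p and whether the S3 WAL prefix is namespaced by the environment name.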
@@ -5,10 +5,10 @@ executors:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
|
||||
- image: neondatabase/rust:1.58
|
||||
- image: zimg/rust:1.58
|
||||
neon-executor:
|
||||
docker:
|
||||
- image: neondatabase/rust:1.58
|
||||
- image: zimg/rust:1.58
|
||||
|
||||
jobs:
|
||||
# A job to build postgres
|
||||
@@ -37,7 +37,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
# Build postgres if the restore_cache didn't find a build.
|
||||
# `make` can't figure out whether the cache is valid, since
|
||||
@@ -54,7 +54,7 @@ jobs:
|
||||
|
||||
- save_cache:
|
||||
name: Save postgres cache
|
||||
key: v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
@@ -85,7 +85,7 @@ jobs:
|
||||
name: Restore postgres cache
|
||||
keys:
|
||||
# Restore ONLY if the rev key matches exactly
|
||||
- v05-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
|
||||
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
@@ -93,29 +93,31 @@ jobs:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
|
||||
# Build the rust code, including test binaries
|
||||
- run:
|
||||
name: Rust build << parameters.build_type >>
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS="--release --features profiling"
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
export CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
export RUSTC_WRAPPER=""
|
||||
export RUSTC_WRAPPER=cachepot
|
||||
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
||||
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
||||
mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
cachepot -s
|
||||
|
||||
- save_cache:
|
||||
name: Save rust cache
|
||||
key: v05-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
|
||||
paths:
|
||||
- ~/.cargo/registry
|
||||
- ~/.cargo/git
|
||||
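A note on the cov_prefix idiom used in the build and test steps above: it relies on how an empty bash array expands. In debug builds the array holds the coverage wrapper; in release builds it is empty, so "${cov_prefix[@]}" expands to zero words and the same command line serves both cases. A minimal standalone sketch (scripts/coverage is the repo's own wrapper and is assumed to exist):

    #!/usr/bin/env bash
    set -e

    BUILD_TYPE=${BUILD_TYPE:-release}

    if [[ $BUILD_TYPE == "debug" ]]; then
        # every command gets wrapped by the coverage helper
        cov_prefix=(scripts/coverage "--profraw-prefix=${CIRCLE_JOB:-local}" --dir=/tmp/zenith/coverage run)
    else
        # empty array: expands to nothing, so the command runs unwrapped
        cov_prefix=()
    fi

    "${cov_prefix[@]}" cargo build --release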
@@ -126,22 +128,35 @@ jobs:
|
||||
name: cargo test
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
cargo test $CARGO_FLAGS
|
||||
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
|
||||
|
||||
# Install the rust binaries, for use by test jobs
|
||||
- run:
|
||||
name: Install rust binaries
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
binaries=$(
|
||||
cargo metadata --format-version=1 --no-deps |
|
||||
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
|
||||
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
|
||||
)
|
||||
|
||||
test_exe_paths=$(
|
||||
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
|
||||
mkdir -p /tmp/zenith/bin
|
||||
mkdir -p /tmp/zenith/test_bin
|
||||
mkdir -p /tmp/zenith/etc
|
||||
@@ -151,15 +166,34 @@ jobs:
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/zenith/bin/$bin
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
|
||||
# Install test executables (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/zenith/test_bin/$(basename $bin)
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/zenith/etc/binaries.list
|
||||
done
|
||||
fi
|
||||
|
||||
# Install the postgres binaries, for use by test jobs
|
||||
- run:
|
||||
name: Install postgres binaries
|
||||
command: |
|
||||
cp -a tmp_install /tmp/zenith/pg_install
|
||||
|
||||
# Save rust binaries for other jobs in the workflow
|
||||
- run:
|
||||
name: Merge coverage data
|
||||
command: |
|
||||
# This will speed up workspace uploads
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
|
||||
fi
|
||||
|
||||
# Save the rust binaries and coverage data for other jobs in this workflow.
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
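The "Install rust binaries" step above derives the list of workspace binaries from cargo metadata instead of hard-coding names. Outside CI the two discovery commands look like this (sketch; assumes jq is installed and the workspace has been built):

    # names of all [[bin]] targets in the workspace
    cargo metadata --format-version=1 --no-deps \
        | jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'

    # paths of the compiled test executables
    cargo test --message-format=json --no-run \
        | jq -r '.executable | select(. != null)'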
@@ -252,7 +286,7 @@ jobs:
|
||||
# no_output_timeout, specified here.
|
||||
no_output_timeout: 10m
|
||||
environment:
|
||||
- NEON_BIN: /tmp/zenith/bin
|
||||
- ZENITH_BIN: /tmp/zenith/bin
|
||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||
- TEST_OUTPUT: /tmp/test_output
|
||||
# this variable will be embedded in perf test report
|
||||
@@ -280,6 +314,12 @@ jobs:
|
||||
|
||||
export GITHUB_SHA=$CIRCLE_SHA1
|
||||
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
# Run the tests.
|
||||
#
|
||||
# The junit.xml file allows CircleCI to display more fine-grained test information
|
||||
@@ -290,7 +330,7 @@ jobs:
|
||||
# -n4 uses four processes to run tests via pytest-xdist
|
||||
# -s is not used to prevent pytest from capturing output, because tests are running
|
||||
# in parallel and logs are mixed between different tests
|
||||
./scripts/pytest \
|
||||
"${cov_prefix[@]}" ./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
@@ -319,12 +359,379 @@ jobs:
|
||||
# The store_test_results step tells CircleCI where to find the junit.xml file.
|
||||
- store_test_results:
|
||||
path: /tmp/test_output
|
||||
# Save data (if any)
|
||||
- run:
|
||||
name: Merge coverage data
|
||||
command: |
|
||||
# This will speed up workspace uploads
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
|
||||
fi
|
||||
# Save coverage data (if any)
|
||||
- persist_to_workspace:
|
||||
root: /tmp/zenith
|
||||
paths:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: neon-xlarge-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
- checkout
|
||||
- restore_cache:
|
||||
name: Restore rust cache
|
||||
keys:
|
||||
# Require an exact match. While an out of date cache might speed up the build,
|
||||
# there's no way to clean out old packages, so the cache grows every time something
|
||||
# changes.
|
||||
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
--input-objects=/tmp/zenith/etc/binaries.list \
|
||||
--commit-url=$COMMIT_URL \
|
||||
--format=github
|
||||
- run:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
# Add link to the coverage report to the commit
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"success\",
|
||||
\"context\": \"zenith-coverage\",
|
||||
\"description\": \"Coverage report is ready\",
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build neondatabase/neon:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:latest
|
||||
|
||||
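The Docker tag used here is simply the commit count of the current branch. git log --oneline | wc -l and git rev-list --count HEAD (used by the GitHub Actions workflow later on this page) give the same number, provided the clone is not shallow. A quick local check, as a sketch:

    # both print the commit count of HEAD, e.g. 1234
    git log --oneline | wc -l
    git rev-list --count HEAD

    DOCKER_TAG=$(git rev-list --count HEAD)
    echo "would tag neondatabase/neon:${DOCKER_TAG}"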
# Build neondatabase/compute-node:latest image and push it to Docker hub
|
||||
docker-image-compute:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:local \
|
||||
--tag neondatabase/compute-tools:latest \
|
||||
-f Dockerfile.compute-tools .
|
||||
# Only push :latest image
|
||||
docker push neondatabase/compute-tools:latest
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
|
||||
--tag neondatabase/compute-node:latest vendor/postgres \
|
||||
--build-arg COMPUTE_TOOLS_TAG=local
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:latest
|
||||
|
||||
# Build production neondatabase/neon:release image and push it to Docker hub
|
||||
docker-image-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:release
|
||||
|
||||
# Build production neondatabase/compute-node:release image and push it to Docker hub
|
||||
docker-image-compute-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:release \
|
||||
--tag neondatabase/compute-tools:local \
|
||||
-f Dockerfile.compute-tools .
|
||||
# Only push :release image
|
||||
docker push neondatabase/compute-tools:release
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
|
||||
--tag neondatabase/compute-node:release vendor/postgres \
|
||||
--build-arg COMPUTE_TOOLS_TAG=local
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:release
|
||||
|
||||
deploy-staging:
|
||||
docker:
|
||||
- image: cimg/python:3.10
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker
|
||||
- run:
|
||||
name: Setup ansible
|
||||
command: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
- run:
|
||||
name: Redeploy
|
||||
command: |
|
||||
cd "$(pwd)/.circleci/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i staging.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
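The key handling in the Redeploy step works because OpenSSH pairs a private key file with a certificate named <key>-cert.pub, so ssh-add ssh-key loads both the Teleport key and its certificate into the agent, after which the files can be deleted. CircleCI already runs an ssh-agent for the job; the GitHub Actions port of this deploy step further down the page starts one explicitly. Condensed sketch of the sequence, assuming the secrets are base64-encoded as in this config:

    eval "$(ssh-agent)"   # needed outside CircleCI
    echo "${TELEPORT_SSH_KEY}"  | tr -d '\n' | base64 --decode > ssh-key
    echo "${TELEPORT_SSH_CERT}" | tr -d '\n' | base64 --decode > ssh-key-cert.pub
    chmod 0600 ssh-key
    ssh-add ssh-key       # picks up ssh-key-cert.pub by naming convention
    rm -f ssh-key ssh-key-cert.pub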
deploy-staging-proxy:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
environment:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
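helm upgrade --install makes the proxy deployment idempotent: the first run installs the chart, later runs upgrade it in place, and --wait blocks until the rollout finishes so a bad image fails the CI job instead of rolling out silently. The proxy deploy commands on this page all follow the same shape; in this sketch the release name and values file are placeholders:

    DOCKER_TAG=$(git log --oneline | wc -l)
    helm upgrade <release-name> neondatabase/neon-proxy \
        --install \
        -f .circleci/helm-values/<env>.proxy.yaml \
        --set image.tag="${DOCKER_TAG}" \
        --wait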
deploy-neon-stress:
|
||||
docker:
|
||||
- image: cimg/python:3.10
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker
|
||||
- run:
|
||||
name: Setup ansible
|
||||
command: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
- run:
|
||||
name: Redeploy
|
||||
command: |
|
||||
cd "$(pwd)/.circleci/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i neon-stress.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-neon-stress-proxy:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
environment:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-release:
|
||||
docker:
|
||||
- image: cimg/python:3.10
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker
|
||||
- run:
|
||||
name: Setup ansible
|
||||
command: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
- run:
|
||||
name: Redeploy
|
||||
command: |
|
||||
cd "$(pwd)/.circleci/ansible"
|
||||
|
||||
RELEASE=true ./get_binaries.sh
|
||||
|
||||
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i production.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-release-proxy:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
environment:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
parameters:
|
||||
remote_repo:
|
||||
type: string
|
||||
environment:
|
||||
REMOTE_REPO: << parameters.remote_repo >>
|
||||
steps:
|
||||
- run:
|
||||
name: Set PR's status to pending
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
- run:
|
||||
name: Request a remote CI test
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "$CI_ACCESS_TOKEN" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
||||
\"remote_repo\": \"$LOCAL_REPO\"
|
||||
}
|
||||
}"
|
||||
|
||||
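The remote-ci-trigger job drives the GitHub REST API directly with curl. For local debugging, the same workflow_dispatch can be issued with the GitHub CLI; this is a sketch and not part of the config, and it assumes gh is authenticated with a token allowed to dispatch workflows in the remote repo:

    gh workflow run testing.yml \
        --repo neondatabase/cloud \
        --ref main \
        -f ci_job_name=neon-cloud-e2e \
        -f commit_hash="$CIRCLE_SHA1" \
        -f remote_repo=neondatabase/neon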
workflows:
|
||||
build_and_test:
|
||||
jobs:
|
||||
@@ -367,3 +774,120 @@ workflows:
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-neon-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
requires:
|
||||
# TODO: consider adding more
|
||||
- other-tests-debug
|
||||
- docker-image:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# Build image only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- pg_regress-tests-release
|
||||
- other-tests-release
|
||||
- docker-image-compute:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# Build image only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- pg_regress-tests-release
|
||||
- other-tests-release
|
||||
- deploy-staging:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
- deploy-staging-proxy:
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- deploy-neon-stress:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
- deploy-neon-stress-proxy:
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- docker-image-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# Build image only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- release
|
||||
requires:
|
||||
- pg_regress-tests-release
|
||||
- other-tests-release
|
||||
- docker-image-compute-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# Build image only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- release
|
||||
requires:
|
||||
- pg_regress-tests-release
|
||||
- other-tests-release
|
||||
- deploy-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- release
|
||||
requires:
|
||||
- docker-image-release
|
||||
- deploy-release-proxy:
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- release
|
||||
requires:
|
||||
- docker-image-release
|
||||
- remote-ci-trigger:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
remote_repo: "neondatabase/cloud"
|
||||
requires:
|
||||
# XXX: Successful build doesn't mean everything is OK, but
|
||||
# the job to be triggered takes so much time to complete (~22 min)
|
||||
# that it's better not to wait for the commented-out steps
|
||||
- build-neon-release
|
||||
# - pg_regress-tests-release
|
||||
# - other-tests-release
|
||||
|
||||
.github/actions/run-python-test-set/action.yml (35 changed lines, vendored)
@@ -2,29 +2,25 @@ name: 'Run python test'
|
||||
description: 'Runs a Neon python test set, performing all the required preparations before'
|
||||
|
||||
inputs:
|
||||
# Select the type of Rust build. Must be "release" or "debug".
|
||||
build_type:
|
||||
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
|
||||
required: true
|
||||
rust_toolchain:
|
||||
description: 'Rust toolchain version to fetch the caches'
|
||||
required: true
|
||||
# This parameter is required, to prevent the mistake of running all tests in one job.
|
||||
test_selection:
|
||||
description: 'A python test suite to run'
|
||||
required: true
|
||||
# Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
|
||||
extra_params:
|
||||
description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
|
||||
required: false
|
||||
default: ''
|
||||
needs_postgres_source:
|
||||
description: 'Set to true if the test suite requires postgres source checked out'
|
||||
required: false
|
||||
default: 'false'
|
||||
run_in_parallel:
|
||||
description: 'Whether to run tests in parallel'
|
||||
required: false
|
||||
default: 'true'
|
||||
save_perf_report:
|
||||
description: 'Whether to upload the performance report'
|
||||
required: false
|
||||
default: 'false'
|
||||
|
||||
@@ -64,7 +60,7 @@ runs:
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
NEON_BIN: /tmp/neon/bin
|
||||
ZENITH_BIN: /tmp/neon/bin
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
# this variable will be embedded in perf test report
|
||||
@@ -85,14 +81,14 @@ runs:
|
||||
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
|
||||
fi
|
||||
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
|
||||
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
|
||||
if [[ "$GITHUB_REF" == "main" ]]; then
|
||||
mkdir -p "$PERF_REPORT_DIR"
|
||||
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
|
||||
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
@@ -115,26 +111,9 @@ runs:
|
||||
-rA $TEST_SELECTION $EXTRA_PARAMS
|
||||
|
||||
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
|
||||
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
|
||||
if [[ "$GITHUB_REF" == "main" ]]; then
|
||||
export REPORT_FROM="$PERF_REPORT_DIR"
|
||||
export REPORT_TO=local
|
||||
scripts/generate_and_push_perf_report.sh
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Delete all data but logs
|
||||
shell: bash -ex {0}
|
||||
if: always()
|
||||
run: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
|
||||
- name: Upload python test logs
|
||||
if: always()
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
retention-days: 7
|
||||
if-no-files-found: error
|
||||
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
|
||||
path: /tmp/test_output/
|
||||
|
||||
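Putting the inputs and steps of the composite action above together, what it ultimately runs is a single pytest invocation through the repo's wrapper. Stripped of the coverage prefix and the conditionals, it amounts to roughly the following (a sketch; TEST_SELECTION resolves to a directory under test_runner/, for example test_runner/performance, and -n4 is only added when run_in_parallel is true):

    export TEST_OUTPUT=/tmp/test_output
    export POSTGRES_DISTRIB_DIR=/tmp/neon/pg_install

    ./scripts/pytest \
        --junitxml="$TEST_OUTPUT/junit.xml" \
        --tb=short \
        --verbose \
        -n4 \
        -rA "$TEST_SELECTION" $EXTRA_PARAMS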
.github/actions/save-coverage-data/action.yml (17 changed lines, vendored)
@@ -1,17 +0,0 @@
name: 'Merge and upload coverage data'
description: 'Compresses and uploads the coverage data as an artifact'

runs:
using: "composite"
steps:
- name: Merge coverage data
shell: bash -ex {0}
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge

- name: Upload coverage data
uses: actions/upload-artifact@v3
with:
retention-days: 7
if-no-files-found: error
name: coverage-data-artifact
path: /tmp/coverage/
.github/workflows/benchmarking.yml (8 changed lines, vendored)
@@ -26,11 +26,11 @@ jobs:
runs-on: [self-hosted, zenith-benchmarker]

env:
POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"
POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"

steps:
- name: Checkout zenith repo
uses: actions/checkout@v3
uses: actions/checkout@v2

# actions/setup-python@v2 is not working correctly on self-hosted runners
# see https://github.com/actions/setup-python/issues/162
@@ -88,7 +88,7 @@ jobs:
# Plus time needed to initialize the test databases.
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
PLATFORM: "neon-staging"
PLATFORM: "zenith-staging"
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
run: |
@@ -96,7 +96,7 @@ jobs:
# since it might generate duplicates when calling ingest_perf_test_result.py
rm -rf perf-report-staging
mkdir -p perf-report-staging
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging

- name: Submit result
env:
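The benchmark job boils down to one pytest command against a remote cluster. Reproducing it outside CI needs the same environment the workflow sets up: the staging connection string (a secret), the platform label, and REMOTE_ENV to tell the harness there are no local neon binaries. A hedged sketch:

    export REMOTE_ENV=1                        # no local neon binaries
    export PLATFORM=neon-staging
    export BENCHMARK_CONNSTR='postgres://...'  # staging connstr, kept in CI secrets
    export POSTGRES_DISTRIB_DIR=/usr/pgsql-14

    rm -rf perf-report-staging
    mkdir -p perf-report-staging
    ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" \
        --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600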
.github/workflows/build_and_test.yml (484 changed lines, vendored)
@@ -1,28 +1,13 @@
|
||||
name: Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
name: build_and_test
|
||||
on: [ push ]
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -ex {0}
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
RUST_BACKTRACE: 1
|
||||
COPT: '-Werror'
|
||||
|
||||
jobs:
|
||||
build-postgres:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
@@ -49,7 +34,7 @@ jobs:
|
||||
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: mold -run make postgres -j$(nproc)
|
||||
run: COPT='-Werror' mold -run make postgres -j$(nproc)
|
||||
|
||||
# actions/cache@v3 does not allow concurrently using the same cache across job steps, so use a separate cache
|
||||
- name: Prepare postgres artifact
|
||||
@@ -67,7 +52,6 @@ jobs:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ build-postgres ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
@@ -101,39 +85,44 @@ jobs:
|
||||
~/.cargo/registry/
|
||||
~/.cargo/git/
|
||||
target/
|
||||
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
|
||||
key: |
|
||||
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
|
||||
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS="--release --features profiling"
|
||||
fi
|
||||
|
||||
export CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
export RUSTC_WRAPPER=cachepot
|
||||
export AWS_ACCESS_KEY_ID="${{ secrets.AWS_ACCESS_KEY_ID }}"
|
||||
export AWS_SECRET_ACCESS_KEY="${{ secrets.AWS_SECRET_ACCESS_KEY }}"
|
||||
export HOME=/home/runner
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
cachepot -s
|
||||
|
||||
- name: Run cargo test
|
||||
run: |
|
||||
export HOME=/home/runner
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
|
||||
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
|
||||
|
||||
- name: Install rust binaries
|
||||
run: |
|
||||
export HOME=/home/runner
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
@@ -148,36 +137,39 @@ jobs:
|
||||
jq -r '.executable | select(. != null)'
|
||||
)
|
||||
|
||||
mkdir -p /tmp/neon/bin/
|
||||
mkdir -p /tmp/neon/test_bin/
|
||||
mkdir -p /tmp/neon/etc/
|
||||
|
||||
# Keep bloated coverage data files away from the rest of the artifact
|
||||
mkdir -p /tmp/coverage/
|
||||
mkdir -p /tmp/neon/bin
|
||||
mkdir -p /tmp/neon/test_bin
|
||||
mkdir -p /tmp/neon/etc
|
||||
|
||||
# Install target binaries
|
||||
for bin in $binaries; do
|
||||
SRC=target/$BUILD_TYPE/$bin
|
||||
DST=/tmp/neon/bin/$bin
|
||||
cp "$SRC" "$DST"
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/neon/etc/binaries.list
|
||||
done
|
||||
|
||||
# Install test executables and write list of all binaries (for code coverage)
|
||||
# Install test executables (for code coverage)
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
for bin in $binaries; do
|
||||
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
|
||||
done
|
||||
for bin in $test_exe_paths; do
|
||||
SRC=$bin
|
||||
DST=/tmp/neon/test_bin/$(basename $bin)
|
||||
cp "$SRC" "$DST"
|
||||
echo "$DST" >> /tmp/coverage/binaries.list
|
||||
cp $SRC $DST
|
||||
echo $DST >> /tmp/neon/etc/binaries.list
|
||||
done
|
||||
fi
|
||||
|
||||
- name: Install postgres binaries
|
||||
run: cp -a tmp_install /tmp/neon/pg_install
|
||||
|
||||
- name: Merge coverage data
|
||||
run: |
|
||||
export HOME=/home/runner
|
||||
# This will speed up workspace uploads
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/neon/coverage merge
|
||||
fi
|
||||
|
||||
- name: Prepare neon artifact
|
||||
run: tar -C /tmp/neon/ -czf ./neon.tgz .
|
||||
|
||||
@@ -189,17 +181,38 @@ jobs:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
|
||||
path: ./neon.tgz
|
||||
|
||||
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
check-codestyle-python:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
strategy:
|
||||
matrix:
|
||||
rust_toolchain: [ 1.58 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Cache poetry deps
|
||||
id: cache_poetry
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run yapf to ensure code format
|
||||
run: poetry run yapf --recursive --diff .
|
||||
|
||||
- name: Run mypy to check types
|
||||
run: poetry run mypy .
|
||||
|
||||
pg_regress-tests:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ build-neon ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
@@ -218,15 +231,10 @@ jobs:
|
||||
test_selection: batch_pg_regress
|
||||
needs_postgres_source: true
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
other-tests:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ build-neon ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug, release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
@@ -244,15 +252,10 @@ jobs:
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: batch_others
|
||||
|
||||
- name: Merge and upload coverage data
|
||||
if: matrix.build_type == 'debug'
|
||||
uses: ./.github/actions/save-coverage-data
|
||||
|
||||
benchmarks:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ build-neon ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ release ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
@@ -270,373 +273,4 @@ jobs:
|
||||
rust_toolchain: ${{ matrix.rust_toolchain }}
|
||||
test_selection: performance
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
env:
|
||||
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
|
||||
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
|
||||
# XXX: no coverage data handling here, since benchmarks are run on release builds,
|
||||
# while coverage is currently collected for the debug ones
|
||||
|
||||
coverage-report:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ other-tests, pg_regress-tests ]
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build_type: [ debug ]
|
||||
rust_toolchain: [ 1.58 ]
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Restore cargo deps cache
|
||||
id: cache_cargo
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: |
|
||||
~/.cargo/registry/
|
||||
~/.cargo/git/
|
||||
target/
|
||||
key: v2-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
|
||||
|
||||
- name: Get Neon artifact for restoration
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
|
||||
path: ./neon-artifact/
|
||||
|
||||
- name: Extract Neon artifact
|
||||
run: |
|
||||
mkdir -p /tmp/neon/
|
||||
tar -xf ./neon-artifact/neon.tgz -C /tmp/neon/
|
||||
rm -rf ./neon-artifact/
|
||||
|
||||
- name: Restore coverage data
|
||||
uses: actions/download-artifact@v3
|
||||
with:
|
||||
name: coverage-data-artifact
|
||||
path: /tmp/coverage/
|
||||
|
||||
- name: Merge coverage data
|
||||
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
|
||||
|
||||
- name: Build and upload coverage report
|
||||
run: |
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/coverage report \
|
||||
--input-objects=/tmp/coverage/binaries.list \
|
||||
--commit-url=$COMMIT_URL \
|
||||
--format=github
|
||||
|
||||
REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
|
||||
|
||||
# Add link to the coverage report to the commit
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"success\",
|
||||
\"context\": \"neon-coverage\",
|
||||
\"description\": \"Coverage report is ready\",
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
trigger-e2e-tests:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ build-neon ]
|
||||
steps:
|
||||
- name: Set PR's status to pending and request a remote CI test
|
||||
run: |
|
||||
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
|
||||
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
|
||||
|
||||
REMOTE_REPO="${{ github.repository_owner }}/cloud"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
|
||||
curl -f -X POST \
|
||||
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
|
||||
-H "Accept: application/vnd.github.v3+json" \
|
||||
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
|
||||
--data \
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"commit_hash\": \"$COMMIT_SHA\",
|
||||
\"remote_repo\": \"${{ github.repository }}\"
|
||||
}
|
||||
}"
|
||||
|
||||
docker-image:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ pg_regress-tests, other-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
with:
|
||||
driver: docker
|
||||
|
||||
- name: Get build tag
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "::set-output name=tag::$(git rev-list --count HEAD)"
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
id: build-tag
|
||||
|
||||
- name: Get legacy build tag
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "::set-output name=tag::latest"
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "::set-output name=tag::release"
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
id: legacy-build-tag
|
||||
|
||||
- name: Build neon Docker image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION="${{github.sha}}"
|
||||
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
|
||||
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
|
||||
pull: true
|
||||
push: true
|
||||
tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
|
||||
|
||||
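The two tag steps in this job reduce to a small branch-dependent mapping. As a standalone script it reads roughly as follows (a sketch; ::set-output is what the workflow uses, even though GitHub has since deprecated it in favour of writing to $GITHUB_OUTPUT):

    case "$GITHUB_REF_NAME" in
        main)
            tag="$(git rev-list --count HEAD)"        # e.g. 1234
            legacy_tag="latest"
            ;;
        release)
            tag="release-$(git rev-list --count HEAD)"
            legacy_tag="release"
            ;;
        *)
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1
            ;;
    esac

    echo "::set-output name=tag::$tag"          # step id: build-tag
    echo "::set-output name=tag::$legacy_tag"   # step id: legacy-build-tag (a separate step in the workflow)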
docker-image-compute:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
needs: [ pg_regress-tests, other-tests ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
outputs:
|
||||
build-tag: ${{steps.build-tag.outputs.tag}}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
with:
|
||||
driver: docker
|
||||
|
||||
- name: Get build tag
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "::set-output name=tag::$(git rev-list --count HEAD)"
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
id: build-tag
|
||||
|
||||
- name: Get legacy build tag
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
echo "::set-output name=tag::latest"
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
echo "::set-output name=tag::release"
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
id: legacy-build-tag
|
||||
|
||||
- name: Build compute-tools Docker image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION="${{github.sha}}"
|
||||
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
|
||||
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
|
||||
push: false
|
||||
file: Dockerfile.compute-tools
|
||||
tags: neondatabase/compute-tools:local
|
||||
|
||||
- name: Push compute-tools Docker image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: .
|
||||
build-args: |
|
||||
GIT_VERSION="${{github.sha}}"
|
||||
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
|
||||
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
|
||||
push: true
|
||||
file: Dockerfile.compute-tools
|
||||
tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
|
||||
|
||||
- name: Build compute-node Docker image
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
context: ./vendor/postgres/
|
||||
build-args:
|
||||
COMPUTE_TOOLS_TAG=local
|
||||
push: true
|
||||
tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
|
||||
|
||||
calculate-deploy-targets:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
outputs:
|
||||
matrix-include: ${{ steps.set-matrix.outputs.include }}
|
||||
steps:
|
||||
- id: set-matrix
|
||||
run: |
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
|
||||
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
|
||||
echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
|
||||
echo "::set-output name=include::[$PRODUCTION]"
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
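calculate-deploy-targets emits a JSON array that the deploy jobs splice into their build matrix with fromJSON(...) on matrix.include. The shell side is plain string assembly; for a main-branch run the output carries exactly these two entries (copied from the step above):

    STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
    NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
    echo "::set-output name=include::[$STAGING, $NEON_STRESS]"

    # downstream, each matrix entry provides env_name for the Ansible inventory
    # and kubeconfig_secret to pick which kubeconfig to base64-decode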
deploy:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
# We need both storage **and** compute images for deploy, because control plane
|
||||
# picks the compute version based on the storage version. If it notices a fresh
|
||||
# storage it may bump the compute version. And if compute image failed to build
|
||||
# it may break things badly.
|
||||
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
strategy:
|
||||
matrix:
|
||||
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Setup ansible
|
||||
run: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
|
||||
- name: Redeploy
|
||||
run: |
|
||||
cd "$(pwd)/.github/ansible"
|
||||
|
||||
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
|
||||
./get_binaries.sh
|
||||
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
|
||||
RELEASE=true ./get_binaries.sh
|
||||
else
|
||||
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
eval $(ssh-agent)
|
||||
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-proxy:
|
||||
runs-on: [ self-hosted, Linux, k8s-runner ]
|
||||
# Compute image isn't strictly required for proxy deploy, but let's still wait for it
|
||||
# to run all deploy jobs consistently.
|
||||
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
|
||||
if: |
|
||||
(github.ref_name == 'main' || github.ref_name == 'release') &&
|
||||
github.event_name != 'workflow_dispatch'
|
||||
strategy:
|
||||
matrix:
|
||||
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
|
||||
env:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
submodules: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Store kubeconfig file
|
||||
run: |
|
||||
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
|
||||
- name: Setup helm v3
|
||||
run: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
|
||||
- name: Re-deploy proxy
|
||||
run: |
|
||||
DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
|
||||
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
|
||||
# save_perf_report: true
|
||||
|
||||
.github/workflows/pg_clients.yml (71 changed lines, vendored)
@@ -1,71 +0,0 @@
|
||||
name: Test Postgres client libraries
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '23 02 * * *' # run once a day, timezone is utc
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
test-postgres-client-libs:
|
||||
runs-on: [ ubuntu-latest ]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.9
|
||||
|
||||
- name: Install Poetry
|
||||
uses: snok/install-poetry@v1
|
||||
|
||||
- name: Cache poetry deps
|
||||
id: cache_poetry
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pypoetry/virtualenvs
|
||||
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
|
||||
|
||||
- name: Install Python deps
|
||||
shell: bash -ex {0}
|
||||
run: ./scripts/pysync
|
||||
|
||||
- name: Run pytest
|
||||
env:
|
||||
REMOTE_ENV: 1
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
TEST_OUTPUT: /tmp/test_output
|
||||
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
|
||||
shell: bash -ex {0}
|
||||
run: |
|
||||
# Test framework expects we have psql binary;
|
||||
# but since we don't really need it in this test, let's mock it
|
||||
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
|
||||
./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
-m "remote_cluster" \
|
||||
-rA "test_runner/pg_clients"
|
||||
|
||||
- name: Post to a Slack channel
|
||||
if: failure()
|
||||
id: slack
|
||||
uses: slackapi/slack-github-action@v1
|
||||
with:
|
||||
channel-id: "C033QLM5P7D" # dev-staging-stream
|
||||
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
|
||||
@@ -1,4 +1,4 @@
name: Check code style and build
name: Build and Test

on:
push:
@@ -6,27 +6,15 @@ on:
- main
pull_request:

defaults:
run:
shell: bash -ex {0}

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

env:
RUST_BACKTRACE: 1

jobs:
check-codestyle-rust:
regression-check:
strategy:
fail-fast: false
matrix:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [1.58]
os: [ubuntu-latest, macos-latest]
timeout-minutes: 50
timeout-minutes: 30
name: run regression test suite
runs-on: ${{ matrix.os }}
@@ -104,30 +92,5 @@ jobs:
- name: Run cargo clippy
run: ./run_clippy.sh

- name: Ensure all project builds
run: cargo build --all --all-targets

check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1

- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}

- name: Install Python deps
run: ./scripts/pysync

- name: Run yapf to ensure code format
run: poetry run yapf --recursive --diff .

- name: Run mypy to check types
run: poetry run mypy .
- name: Run cargo test
run: cargo test --all --all-targets

7
Cargo.lock
generated
@@ -461,7 +461,6 @@ dependencies = [
"tar",
"tokio",
"tokio-postgres",
"url",
"workspace_hack",
]

@@ -2151,7 +2150,7 @@ dependencies = [
"serde",
"thiserror",
"utils",
"wal_craft",
"wal_generate",
"workspace_hack",
]

@@ -3753,16 +3752,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

[[package]]
name = "wal_craft"
name = "wal_generate"
version = "0.1.0"
dependencies = [
"anyhow",
"clap 3.0.14",
"env_logger",
"log",
"once_cell",
"postgres",
"postgres_ffi",
"tempfile",
]

10
Dockerfile
@@ -1,5 +1,5 @@
# Build Postgres
FROM neondatabase/rust:1.58 AS pg-build
FROM zimg/rust:1.58 AS pg-build
WORKDIR /pg

USER root
@@ -14,7 +14,7 @@ RUN set -e \
&& tar -C tmp_install -czf /postgres_install.tar.gz .

# Build zenith binaries
FROM neondatabase/rust:1.58 AS build
FROM zimg/rust:1.58 AS build
ARG GIT_VERSION=local

ARG CACHEPOT_BUCKET=zenith-rust-cachepot
@@ -46,9 +46,9 @@ RUN set -e \
&& useradd -d /data zenith \
&& chown -R zenith:zenith /data

COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin

COPY --from=pg-build /pg/tmp_install/ /usr/local/
COPY --from=pg-build /postgres_install.tar.gz /data/

@@ -1,6 +1,6 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .circle/config.yml
FROM neondatabase/rust:1.58 AS rust-build
FROM zimg/rust:1.58 AS rust-build

ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
@@ -15,4 +15,4 @@ RUN set -e \
# Final image that only has one binary
FROM debian:buster-slim

COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl

39
Makefile
@@ -1,8 +1,3 @@
ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))

# Where to install Postgres, default is ./tmp_install, maybe useful for package managers
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install

# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
@@ -60,55 +55,55 @@ zenith: postgres-headers
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

### PostgreSQL parts
$(POSTGRES_INSTALL_DIR)/build/config.status:
tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build
(cd $(POSTGRES_INSTALL_DIR)/build && \
$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
$(SECCOMP) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)
--prefix=$(abspath tmp_install) > configure.log)

# nicer alias for running 'configure'
.PHONY: postgres-configure
postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status
postgres-configure: tmp_install/build/config.status

# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
# Install the PostgreSQL header files into tmp_install/include
.PHONY: postgres-headers
postgres-headers: postgres-configure
+@echo "Installing PostgreSQL headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install

# Compile and install PostgreSQL and contrib/neon
.PHONY: postgres
postgres: postgres-configure \
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
+@echo "Compiling PostgreSQL"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/neon"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
$(MAKE) -C tmp_install/build/contrib/neon install
+@echo "Compiling contrib/neon_test_utils"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
$(MAKE) -C tmp_install/build/contrib/neon_test_utils install
+@echo "Compiling pg_buffercache"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+@echo "Compiling pageinspect"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
$(MAKE) -C tmp_install/build/contrib/pageinspect install

.PHONY: postgres-clean
postgres-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean

# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
cd tmp_install/build && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean

# This removes everything
.PHONY: distclean
distclean:
rm -rf $(POSTGRES_INSTALL_DIR)
rm -rf tmp_install
$(CARGO_CMD_PREFIX) cargo clean

.PHONY: fmt
@@ -117,4 +112,4 @@ fmt:

.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
ln -s -f ../../pre-commit.py .git/hooks/pre-commit
@@ -53,7 +53,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf etcd openssl
brew install protobuf etcd
```

2. [Install Rust](https://www.rust-lang.org/tools/install)

@@ -18,5 +18,4 @@ serde_json = "1"
tar = "0.4"
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
url = "2.2.2"
workspace_hack = { version = "0.1", path = "../workspace_hack" }
@@ -33,7 +33,7 @@ use std::process::exit;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::{error, info};
|
||||
@@ -45,7 +45,6 @@ use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
use url::Url;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `utils::logging` later
|
||||
@@ -132,7 +131,7 @@ fn main() -> Result<()> {
|
||||
|
||||
let compute_state = ComputeNode {
|
||||
start_time: Utc::now(),
|
||||
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
|
||||
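The connstr change above trades a parsed url::Url for a plain String, and the zenith_admin fallback later in the diff becomes a textual replacen. A minimal sketch of the two ways to swap the username, assuming the url crate 2.x API; the connection string is a made-up example, not taken from any deployment:

use url::Url;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical connection string, for illustration only.
    let connstr = "postgresql://cloud_admin@localhost:5432/postgres";

    // Url-based variant: the parser owns escaping and component boundaries.
    let mut as_url = Url::parse(connstr)?;
    as_url
        .set_username("zenith_admin")
        .map_err(|_| "invalid connstr")?;

    // String-based variant, as in the new code: plain substitution of the first match.
    let as_string = connstr.replacen("cloud_admin", "zenith_admin", 1);

    println!("via Url:    {}", as_url);
    println!("via String: {}", as_string);
    Ok(())
}

The string form is simpler but relies on the username literal appearing exactly once before any other match; the Url form keeps percent-encoding correct if a password or database name ever contains special characters.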
@@ -1,3 +1,5 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use log::error;
|
||||
use postgres::Client;
|
||||
@@ -21,8 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
|
||||
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
|
||||
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
|
||||
let connstr = &compute.connstr;
|
||||
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
|
||||
if client.is_closed() {
|
||||
return Err(anyhow!("connection to postgres closed"));
|
||||
}
|
||||
|
||||
@@ -35,8 +35,7 @@ use crate::spec::*;
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
pub start_time: DateTime<Utc>,
|
||||
// Url type maintains proper escaping
|
||||
pub connstr: url::Url,
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ComputeSpec,
|
||||
@@ -269,33 +268,28 @@ impl ComputeNode {
|
||||
// In this case we need to connect with old `zenith_admin`name
|
||||
// and create new user. We cannot simply rename connected user,
|
||||
// but we can create a new one and grant it all privileges.
|
||||
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
|
||||
let mut client = match Client::connect(&self.connstr, NoTls) {
|
||||
Err(e) => {
|
||||
info!(
|
||||
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
|
||||
e
|
||||
);
|
||||
let mut zenith_admin_connstr = self.connstr.clone();
|
||||
let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
|
||||
|
||||
zenith_admin_connstr
|
||||
.set_username("zenith_admin")
|
||||
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
|
||||
|
||||
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
|
||||
let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
|
||||
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
|
||||
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
|
||||
drop(client);
|
||||
|
||||
// reconnect with connsting with expected name
|
||||
Client::connect(self.connstr.as_str(), NoTls)?
|
||||
Client::connect(&self.connstr, NoTls)?
|
||||
}
|
||||
Ok(client) => client,
|
||||
};
|
||||
|
||||
handle_roles(&self.spec, &mut client)?;
|
||||
handle_databases(&self.spec, &mut client)?;
|
||||
handle_role_deletions(self, &mut client)?;
|
||||
handle_grants(self, &mut client)?;
|
||||
handle_grants(&self.spec, &mut client)?;
|
||||
create_writablity_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
|
||||
@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
|
||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
// Then update it in the shared state. This function never errors out.
|
||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||
fn watch_compute_activity(compute: &ComputeNode) {
|
||||
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.as_str();
|
||||
let connstr = compute.connstr.clone();
|
||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||
let mut client = Client::connect(connstr, NoTls);
|
||||
let mut client = Client::connect(&connstr, NoTls);
|
||||
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
||||
|
||||
info!("watching Postgres activity at {}", connstr);
|
||||
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
info!("connection to postgres closed, trying to reconnect");
|
||||
|
||||
// Connection is closed, reconnect and try again.
|
||||
client = Client::connect(connstr, NoTls);
|
||||
client = Client::connect(&connstr, NoTls);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
|
||||
debug!("cannot connect to postgres: {}, retrying", e);
|
||||
|
||||
// Establish a new connection and try again.
|
||||
client = Client::connect(connstr, NoTls);
|
||||
client = Client::connect(&connstr, NoTls);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::fmt::Write;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
@@ -139,11 +138,9 @@ impl Role {
|
||||
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
||||
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
||||
if pass.starts_with("SCRAM-SHA-256") {
|
||||
write!(params, " PASSWORD '{pass}'")
|
||||
.expect("String is documented to not to error during write operations");
|
||||
params.push_str(&format!(" PASSWORD '{}'", pass));
|
||||
} else {
|
||||
write!(params, " PASSWORD 'md5{pass}'")
|
||||
.expect("String is documented to not to error during write operations");
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
}
|
||||
} else {
|
||||
params.push_str(" PASSWORD NULL");
|
||||
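Both sides of this hunk append a PASSWORD clause to an options string; one uses write! into a String, the other push_str(&format!(...)). A small stand-alone sketch of the two styles (the SCRAM prefix check mirrors the comment above; the sample values are invented):

use std::fmt::Write;

fn password_clause(pass: &str) -> String {
    let mut params = String::new();

    // write! into a String cannot fail, so the expect() only documents that fact.
    if pass.starts_with("SCRAM-SHA-256") {
        write!(params, " PASSWORD '{pass}'").expect("writing to a String never fails");
    } else {
        // push_str + format! allocates a temporary String but reads much the same.
        params.push_str(&format!(" PASSWORD 'md5{}'", pass));
    }
    params
}

fn main() {
    println!("{}", password_clause("SCRAM-SHA-256$4096:..."));
    println!("{}", password_clause("0123456789abcdef"));
}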
@@ -161,8 +158,7 @@ impl Database {
|
||||
/// it may require a proper quoting too.
|
||||
pub fn to_pg_options(&self) -> String {
|
||||
let mut params: String = self.options.as_pg_options();
|
||||
write!(params, " OWNER {}", &self.owner.quote())
|
||||
.expect("String is documented to not to error during write operations");
|
||||
params.push_str(&format!(" OWNER {}", &self.owner.quote()));
|
||||
|
||||
params
|
||||
}
|
||||
@@ -248,20 +244,18 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
|
||||
bail!("Postgres exited unexpectedly with code {}", code);
|
||||
}
|
||||
|
||||
// Check that we can open pid file first.
|
||||
if let Ok(file) = File::open(&pid_path) {
|
||||
let file = BufReader::new(file);
|
||||
let last_line = file.lines().last();
|
||||
if pid_path.exists() {
|
||||
let file = BufReader::new(File::open(&pid_path)?);
|
||||
let status = file
|
||||
.lines()
|
||||
.last()
|
||||
.unwrap()
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||
|
||||
// Pid file could be there and we could read it, but it could be empty, for example.
|
||||
if let Some(Ok(line)) = last_line {
|
||||
let status = line.trim();
|
||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||
|
||||
// Now Postgres is ready to accept connections
|
||||
if status == "ready" && can_connect {
|
||||
break;
|
||||
}
|
||||
// Now Postgres is ready to accept connections
|
||||
if status.trim() == "ready" && can_connect {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
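The wait_for_postgres change reworks how postmaster.pid is read: the older variant tolerates a missing or empty file, the newer one unwraps the last line. A self-contained sketch of the tolerant reading pattern, using a temporary file instead of a real PGDATA:

use std::fs::File;
use std::io::{BufRead, BufReader, Write};

// Returns the trimmed last line of the pid file, or None if it cannot be read.
fn last_status_line(path: &std::path::Path) -> Option<String> {
    let file = File::open(path).ok()?;
    let last_line = BufReader::new(file).lines().last()?;
    last_line.ok().map(|l| l.trim().to_string())
}

fn main() -> std::io::Result<()> {
    let pid_path = std::env::temp_dir().join("postmaster.pid.example");
    // A made-up pid file; the real one is written by Postgres itself.
    let mut f = File::create(&pid_path)?;
    writeln!(f, "12345")?;
    writeln!(f, "/some/pgdata")?;
    writeln!(f, "ready")?;

    // An empty or missing file simply yields None instead of panicking.
    assert_eq!(last_status_line(&pid_path).as_deref(), Some("ready"));
    std::fs::remove_file(&pid_path)?;
    Ok(())
}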
@@ -1,12 +1,10 @@
|
||||
use std::path::Path;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::Result;
|
||||
use log::{info, log_enabled, warn, Level};
|
||||
use postgres::error::SqlState;
|
||||
use postgres::{Client, NoTls};
|
||||
use postgres::Client;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::config;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
@@ -99,13 +97,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
// Process delta operations first
|
||||
if let Some(ops) = &spec.delta_operations {
|
||||
info!("processing role renames");
|
||||
info!("processing delta operations on roles");
|
||||
for op in ops {
|
||||
match op.action.as_ref() {
|
||||
// We do not check either role exists or not,
|
||||
// Postgres will take care of it for us
|
||||
"delete_role" => {
|
||||
// no-op now, roles will be deleted at the end of configuration
|
||||
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
||||
|
||||
warn!("deleting role '{}'", &op.name);
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
// Renaming role drops its password, since role name is
|
||||
// Renaming role drops its password, since tole name is
|
||||
// used as a salt there. It is important that this role
|
||||
// is recorded with a new `name` in the `roles` list.
|
||||
// Follow up roles update will set the new password.
|
||||
@@ -179,7 +182,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"GRANT pg_read_all_data, pg_write_all_data TO {}",
|
||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
||||
name.quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
@@ -194,70 +197,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reassign all dependent objects and delete requested roles.
|
||||
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
let spec = &node.spec;
|
||||
|
||||
// First, reassign all dependent objects to db owners.
|
||||
if let Some(ops) = &spec.delta_operations {
|
||||
info!("reassigning dependent objects of to-be-deleted roles");
|
||||
for op in ops {
|
||||
if op.action == "delete_role" {
|
||||
reassign_owned_objects(node, &op.name)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Second, proceed with role deletions.
|
||||
let mut xact = client.transaction()?;
|
||||
if let Some(ops) = &spec.delta_operations {
|
||||
info!("processing role deletions");
|
||||
for op in ops {
|
||||
// We do not check either role exists or not,
|
||||
// Postgres will take care of it for us
|
||||
if op.action == "delete_role" {
|
||||
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
|
||||
|
||||
warn!("deleting role '{}'", &op.name);
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Reassign all owned objects in all databases to the owner of the database.
|
||||
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
|
||||
for db in &node.spec.cluster.databases {
|
||||
if db.owner != *role_name {
|
||||
let mut connstr = node.connstr.clone();
|
||||
// database name is always the last and the only component of the path
|
||||
connstr.set_path(&db.name);
|
||||
|
||||
let mut client = Client::connect(connstr.as_str(), NoTls)?;
|
||||
|
||||
// This will reassign all dependent objects to the db owner
|
||||
let reassign_query = format!(
|
||||
"REASSIGN OWNED BY {} TO {}",
|
||||
role_name.quote(),
|
||||
db.owner.quote()
|
||||
);
|
||||
info!(
|
||||
"reassigning objects owned by '{}' in db '{}' to '{}'",
|
||||
role_name, &db.name, &db.owner
|
||||
);
|
||||
client.simple_query(&reassign_query)?;
|
||||
|
||||
// This now will only drop privileges of the role
|
||||
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
|
||||
client.simple_query(&drop_query)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// It follows mostly the same logic as `handle_roles()` excepting that we
|
||||
/// does not use an explicit transactions block, since major database operations
|
||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||
@@ -350,66 +289,23 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
|
||||
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
|
||||
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
|
||||
let spec = &node.spec;
|
||||
|
||||
// Grant CREATE ON DATABASE to the database owner
|
||||
// to allow clients create trusted extensions.
|
||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
// We now have a separate `web_access` role to connect to the database
|
||||
// via the web interface and proxy link auth. And also we grant a
|
||||
// read / write all data privilege to every role. So also grant
|
||||
// create to everyone.
|
||||
// XXX: later we should stop messing with Postgres ACL in such horrible
|
||||
// ways.
|
||||
let roles = spec
|
||||
.cluster
|
||||
.roles
|
||||
.iter()
|
||||
.map(|r| r.name.quote())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.quote(),
|
||||
roles.join(", ")
|
||||
db.owner.quote()
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
// Do some per-database access adjustments. We'd better do this at db creation time,
|
||||
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
|
||||
// atomically.
|
||||
let mut db_connstr = node.connstr.clone();
|
||||
for db in &node.spec.cluster.databases {
|
||||
// database name is always the last and the only component of the path
|
||||
db_connstr.set_path(&db.name);
|
||||
|
||||
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
|
||||
|
||||
// This will only change ownership on the schema itself, not the objects
|
||||
// inside it. Without it owner of the `public` schema will be `cloud_admin`
|
||||
// and database owner cannot do anything with it.
|
||||
let alter_query = format!("ALTER SCHEMA public OWNER TO {}", db.owner.quote());
|
||||
let res = db_client.simple_query(&alter_query);
|
||||
|
||||
if let Err(e) = res {
|
||||
if e.code() == Some(&SqlState::INVALID_SCHEMA_NAME) {
|
||||
// This is OK, db just don't have a `public` schema.
|
||||
// Probably user dropped it manually.
|
||||
info!("no 'public' schema found in the database {}", db.name);
|
||||
} else {
|
||||
// Something different happened, propagate the error
|
||||
return Err(anyhow!(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
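handle_grants now builds one GRANT CREATE ON DATABASE statement per database, targeting every role from the spec rather than only the owner. A sketch of that query construction, with a simplistic stand-in for the project's quote() helper (the real PgIdent quoting is more careful, and the names here are hypothetical):

// Naive identifier quoting, only for illustration; not safe for untrusted input.
fn quote_ident(name: &str) -> String {
    format!("\"{}\"", name.replace('"', "\"\""))
}

fn grant_create_query(dbname: &str, roles: &[&str]) -> String {
    let quoted: Vec<String> = roles.iter().map(|r| quote_ident(r)).collect();
    format!(
        "GRANT CREATE ON DATABASE {} TO {}",
        quote_ident(dbname),
        quoted.join(", ")
    )
}

fn main() {
    let q = grant_create_query("main", &["web_access", "app_user"]);
    println!("{q}");
    // GRANT CREATE ON DATABASE "main" TO "web_access", "app_user"
}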
@@ -403,6 +403,16 @@ impl LocalEnv {
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
@@ -411,6 +421,12 @@ impl LocalEnv {
|
||||
);
|
||||
}
|
||||
}
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
[dev-dependencies]
|
||||
env_logger = "0.9"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
wal_craft = { path = "wal_craft" }
|
||||
wal_generate = { path = "wal_generate" }
|
||||
|
||||
[build-dependencies]
|
||||
bindgen = "0.59.1"
|
||||
|
||||
@@ -2,7 +2,6 @@ extern crate bindgen;
|
||||
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
|
||||
use bindgen::callbacks::ParseCallbacks;
|
||||
|
||||
@@ -46,43 +45,6 @@ fn main() {
|
||||
// Tell cargo to invalidate the built crate whenever the wrapper changes
|
||||
println!("cargo:rerun-if-changed=pg_control_ffi.h");
|
||||
|
||||
// Finding the location of C headers for the Postgres server:
|
||||
// - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
|
||||
// - if there's a `bin/pg_config` file use it for getting include server, otherwise use `<project_root>/tmp_install/include/postgresql/server`
|
||||
let mut pg_install_dir: PathBuf;
|
||||
if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR") {
|
||||
pg_install_dir = postgres_install_dir.into();
|
||||
} else {
|
||||
pg_install_dir = PathBuf::from("tmp_install")
|
||||
}
|
||||
|
||||
if pg_install_dir.is_relative() {
|
||||
let cwd = env::current_dir().unwrap();
|
||||
pg_install_dir = cwd.join("..").join("..").join(pg_install_dir);
|
||||
}
|
||||
|
||||
let pg_config_bin = pg_install_dir.join("bin").join("pg_config");
|
||||
let inc_server_path: String = if pg_config_bin.exists() {
|
||||
let output = Command::new(pg_config_bin)
|
||||
.arg("--includedir-server")
|
||||
.output()
|
||||
.expect("failed to execute `pg_config --includedir-server`");
|
||||
|
||||
if !output.status.success() {
|
||||
panic!("`pg_config --includedir-server` failed")
|
||||
}
|
||||
|
||||
String::from_utf8(output.stdout).unwrap().trim_end().into()
|
||||
} else {
|
||||
pg_install_dir
|
||||
.join("include")
|
||||
.join("postgresql")
|
||||
.join("server")
|
||||
.into_os_string()
|
||||
.into_string()
|
||||
.unwrap()
|
||||
};
|
||||
|
||||
// The bindgen::Builder is the main entry point
|
||||
// to bindgen, and lets you build up options for
|
||||
// the resulting bindings.
|
||||
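The block removed above resolved the server include directory via POSTGRES_INSTALL_DIR and an optional bin/pg_config. A stripped-down sketch of that probe; the paths are assumptions and the fallback directory is just the conventional tmp_install layout, not necessarily what any given build uses:

use std::path::Path;
use std::process::Command;

fn server_include_dir(pg_install_dir: &Path) -> String {
    let pg_config = pg_install_dir.join("bin").join("pg_config");
    if pg_config.exists() {
        // Ask the installed pg_config where the server headers live.
        let out = Command::new(pg_config)
            .arg("--includedir-server")
            .output()
            .expect("failed to run pg_config --includedir-server");
        assert!(out.status.success(), "pg_config --includedir-server failed");
        String::from_utf8(out.stdout).unwrap().trim_end().to_string()
    } else {
        // Fall back to the layout produced by `configure --prefix=<tmp_install>`.
        pg_install_dir
            .join("include")
            .join("postgresql")
            .join("server")
            .display()
            .to_string()
    }
}

fn main() {
    let dir = std::env::var("POSTGRES_INSTALL_DIR").unwrap_or_else(|_| "tmp_install".into());
    println!("server headers: {}", server_include_dir(Path::new(&dir)));
}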
@@ -119,7 +81,15 @@ fn main() {
|
||||
// explicit padding fields.
|
||||
.explicit_padding(true)
|
||||
//
|
||||
.clang_arg(format!("-I{inc_server_path}"))
|
||||
// Path the server include dir. It is in tmp_install/include/server, if you did
|
||||
// "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
|
||||
// and used DESTDIR to move it into tmp_install, then it's in
|
||||
// tmp_install/include/postgres/server
|
||||
// 'pg_config --includedir-server' would perhaps be the more proper way to find it,
|
||||
// but this will do for now.
|
||||
//
|
||||
.clang_arg("-I../../tmp_install/include/server")
|
||||
.clang_arg("-I../../tmp_install/include/postgresql/server")
|
||||
//
|
||||
// Finish the builder and generate the bindings.
|
||||
//
|
||||
|
||||
@@ -82,17 +82,7 @@ impl WalStreamDecoder {
|
||||
// that cross page boundaries.
|
||||
loop {
|
||||
// parse and verify page boundaries as we go
|
||||
if self.padlen > 0 {
|
||||
// We should first skip padding, as we may have to skip some page headers if we're processing the XLOG_SWITCH record.
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// skip padding
|
||||
self.inputbuf.advance(self.padlen as usize);
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
|
||||
// parse long header
|
||||
|
||||
if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
|
||||
@@ -138,6 +128,15 @@ impl WalStreamDecoder {
|
||||
|
||||
self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
|
||||
continue;
|
||||
} else if self.padlen > 0 {
|
||||
if self.inputbuf.remaining() < self.padlen as usize {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
// skip padding
|
||||
self.inputbuf.advance(self.padlen as usize);
|
||||
self.lsn += self.padlen as u64;
|
||||
self.padlen = 0;
|
||||
} else if self.contlen == 0 {
|
||||
assert!(self.recordbuf.is_empty());
|
||||
|
||||
@@ -227,10 +226,10 @@ impl WalStreamDecoder {
|
||||
self.padlen = self.lsn.calc_padding(8u32) as u32;
|
||||
}
|
||||
|
||||
// We should return LSN of the next record, not the last byte of this record or
|
||||
// the byte immediately after. Note that this handles both XLOG_SWITCH and usual
|
||||
// records, the former "spans" until the next WAL segment (see test_xlog_switch).
|
||||
let result = (self.lsn + self.padlen as u64, recordbuf);
|
||||
// Always align resulting LSN on 0x8 boundary -- that is important for getPage()
|
||||
// and WalReceiver integration. Since this code is used both for WalReceiver and
|
||||
// initial WAL import let's force alignment right here.
|
||||
let result = (self.lsn.align(), recordbuf);
|
||||
Ok(Some(result))
|
||||
}
|
||||
}
|
||||
|
||||
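The replaced return value switches from `self.lsn + padlen` to an explicit `self.lsn.align()`, i.e. rounding the LSN up to an 8-byte boundary. The project's Lsn type provides that method; a tiny arithmetic sketch of what such an alignment amounts to, using a bare u64 rather than the real Lsn:

// Round a WAL position up to the next 8-byte (MAXALIGN) boundary.
fn align8(lsn: u64) -> u64 {
    (lsn + 7) & !7
}

fn main() {
    assert_eq!(align8(0x16_0000_0000), 0x16_0000_0000); // already aligned
    assert_eq!(align8(0x16_0000_0005), 0x16_0000_0008); // padded up
    // Padding that would be skipped to reach the boundary:
    let lsn = 0x16_0000_0005u64;
    println!("padding = {}", align8(lsn) - lsn);
}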
@@ -597,18 +597,19 @@ mod tests {
|
||||
fn init_logging() {
|
||||
let _ = env_logger::Builder::from_env(
|
||||
env_logger::Env::default()
|
||||
.default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"),
|
||||
.default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"),
|
||||
)
|
||||
.is_test(true)
|
||||
.try_init();
|
||||
}
|
||||
|
||||
fn test_end_of_wal<C: wal_craft::Crafter>(
|
||||
fn test_end_of_wal(
|
||||
test_name: &str,
|
||||
generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result<postgres::types::PgLsn>,
|
||||
expected_end_of_wal_non_partial: Lsn,
|
||||
last_segment: &str,
|
||||
) {
|
||||
use wal_craft::*;
|
||||
use wal_generate::*;
|
||||
// 1. Generate some WAL
|
||||
let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("..")
|
||||
@@ -621,9 +622,9 @@ mod tests {
|
||||
fs::remove_dir_all(&cfg.datadir).unwrap();
|
||||
}
|
||||
cfg.initdb().unwrap();
|
||||
let srv = cfg.start_server().unwrap();
|
||||
let mut srv = cfg.start_server().unwrap();
|
||||
let expected_wal_end: Lsn =
|
||||
u64::from(C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
|
||||
u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
|
||||
srv.kill();
|
||||
|
||||
// 2. Pick WAL generated by initdb
|
||||
@@ -680,8 +681,9 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_simple() {
|
||||
init_logging();
|
||||
test_end_of_wal::<wal_craft::Simple>(
|
||||
test_end_of_wal(
|
||||
"test_find_end_of_wal_simple",
|
||||
wal_generate::generate_simple,
|
||||
"0/2000000".parse::<Lsn>().unwrap(),
|
||||
"000000010000000000000001",
|
||||
);
|
||||
@@ -690,8 +692,9 @@ mod tests {
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
|
||||
init_logging();
|
||||
test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
|
||||
test_end_of_wal(
|
||||
"test_find_end_of_wal_crossing_segment_followed_by_small_one",
|
||||
wal_generate::generate_wal_record_crossing_segment_followed_by_small_one,
|
||||
"0/3000000".parse::<Lsn>().unwrap(),
|
||||
"000000010000000000000002",
|
||||
);
|
||||
@@ -701,8 +704,9 @@ mod tests {
|
||||
#[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
|
||||
pub fn test_find_end_of_wal_last_crossing_segment() {
|
||||
init_logging();
|
||||
test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
|
||||
test_end_of_wal(
|
||||
"test_find_end_of_wal_last_crossing_segment",
|
||||
wal_generate::generate_last_wal_record_crossing_segment,
|
||||
"0/3000000".parse::<Lsn>().unwrap(),
|
||||
"000000010000000000000002",
|
||||
);
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
use anyhow::*;
|
||||
use clap::{App, Arg, ArgMatches};
|
||||
use std::str::FromStr;
|
||||
use wal_craft::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
|
||||
.init();
|
||||
let type_arg = &Arg::new("type")
|
||||
.takes_value(true)
|
||||
.help("Type of WAL to craft")
|
||||
.possible_values([
|
||||
Simple::NAME,
|
||||
LastWalRecordXlogSwitch::NAME,
|
||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
|
||||
WalRecordCrossingSegmentFollowedBySmallOne::NAME,
|
||||
LastWalRecordCrossingSegment::NAME,
|
||||
])
|
||||
.required(true);
|
||||
let arg_matches = App::new("Postgres WAL crafter")
|
||||
.about("Crafts Postgres databases with specific WAL properties")
|
||||
.subcommand(
|
||||
App::new("print-postgres-config")
|
||||
.about("Print the configuration required for PostgreSQL server before running this script")
|
||||
)
|
||||
.subcommand(
|
||||
App::new("with-initdb")
|
||||
.about("Craft WAL in a new data directory first initialized with initdb")
|
||||
.arg(type_arg)
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.takes_value(true)
|
||||
.help("Data directory for the Postgres server")
|
||||
.required(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.takes_value(true)
|
||||
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
App::new("in-existing")
|
||||
.about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
|
||||
.arg(type_arg)
|
||||
.arg(
|
||||
Arg::new("connection")
|
||||
.takes_value(true)
|
||||
.help("Connection string to the Postgres database to populate")
|
||||
.required(true)
|
||||
)
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let wal_craft = |arg_matches: &ArgMatches, client| {
|
||||
let lsn = match arg_matches.value_of("type").unwrap() {
|
||||
Simple::NAME => Simple::craft(client)?,
|
||||
LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
|
||||
LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
|
||||
LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)?
|
||||
}
|
||||
WalRecordCrossingSegmentFollowedBySmallOne::NAME => {
|
||||
WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
|
||||
}
|
||||
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
|
||||
a => panic!("Unknown --type argument: {}", a),
|
||||
};
|
||||
println!("end_of_wal = {}", lsn);
|
||||
Ok(())
|
||||
};
|
||||
|
||||
match arg_matches.subcommand() {
|
||||
None => panic!("No subcommand provided"),
|
||||
Some(("print-postgres-config", _)) => {
|
||||
for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
|
||||
println!("{}", cfg);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Some(("with-initdb", arg_matches)) => {
|
||||
let cfg = Conf {
|
||||
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
|
||||
datadir: arg_matches.value_of("datadir").unwrap().into(),
|
||||
};
|
||||
cfg.initdb()?;
|
||||
let srv = cfg.start_server()?;
|
||||
wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?;
|
||||
srv.kill();
|
||||
Ok(())
|
||||
}
|
||||
Some(("in-existing", arg_matches)) => wal_craft(
|
||||
arg_matches,
|
||||
&mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
|
||||
.connect(postgres::NoTls)?,
|
||||
),
|
||||
Some(_) => panic!("Unknown subcommand"),
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,5 @@
|
||||
[package]
|
||||
name = "wal_craft"
|
||||
name = "wal_generate"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
@@ -10,7 +10,5 @@ anyhow = "1.0"
|
||||
clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
once_cell = "1.8.0"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres_ffi = { path = "../" }
|
||||
tempfile = "3.2"
|
||||
58
libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use anyhow::*;
|
||||
use clap::{App, Arg};
|
||||
use wal_generate::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
env_logger::Builder::from_env(
|
||||
env_logger::Env::default().default_filter_or("wal_generate=info"),
|
||||
)
|
||||
.init();
|
||||
let arg_matches = App::new("Postgres WAL generator")
|
||||
.about("Generates Postgres databases with specific WAL properties")
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.short('D')
|
||||
.long("datadir")
|
||||
.takes_value(true)
|
||||
.help("Data directory for the Postgres server")
|
||||
.required(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.takes_value(true)
|
||||
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("type")
|
||||
.long("type")
|
||||
.takes_value(true)
|
||||
.help("Type of WAL to generate")
|
||||
.possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
|
||||
.required(true)
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let cfg = Conf {
|
||||
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
|
||||
datadir: arg_matches.value_of("datadir").unwrap().into(),
|
||||
};
|
||||
cfg.initdb()?;
|
||||
let mut srv = cfg.start_server()?;
|
||||
let lsn = match arg_matches.value_of("type").unwrap() {
|
||||
"simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
|
||||
"last_wal_record_crossing_segment" => {
|
||||
generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
|
||||
}
|
||||
"wal_record_crossing_segment_followed_by_small_one" => {
|
||||
generate_wal_record_crossing_segment_followed_by_small_one(
|
||||
&mut srv.connect_with_timeout()?,
|
||||
)?
|
||||
}
|
||||
a => panic!("Unknown --type argument: {}", a),
|
||||
};
|
||||
println!("end_of_wal = {}", lsn);
|
||||
srv.kill();
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,12 +1,8 @@
|
||||
use anyhow::*;
|
||||
use core::time::Duration;
|
||||
use log::*;
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres::types::PgLsn;
|
||||
use postgres::Client;
|
||||
use postgres_ffi::xlog_utils::{
|
||||
XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
};
|
||||
use std::cmp::Ordering;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -26,16 +22,6 @@ pub struct PostgresServer {
|
||||
client_config: postgres::Config,
|
||||
}
|
||||
|
||||
pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
|
||||
vec![
|
||||
"wal_keep_size=50MB", // Ensure old WAL is not removed
|
||||
"shared_preload_libraries=neon", // can only be loaded at startup
|
||||
// Disable background processes as much as possible
|
||||
"wal_writer_delay=10s",
|
||||
"autovacuum=off",
|
||||
]
|
||||
});
|
||||
|
||||
impl Conf {
|
||||
fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
@@ -99,8 +85,12 @@ impl Conf {
|
||||
.arg(unix_socket_dir_path.as_os_str())
|
||||
.arg("-D")
|
||||
.arg(self.datadir.as_os_str())
|
||||
.args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
|
||||
.args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
|
||||
.args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
|
||||
.args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup
|
||||
// Disable background processes as much as possible
|
||||
.args(&["-c", "wal_writer_delay=10s"])
|
||||
.args(&["-c", "autovacuum=off"])
|
||||
.stderr(Stdio::from(log_file))
|
||||
.spawn()?;
|
||||
let server = PostgresServer {
|
||||
@@ -154,7 +144,7 @@ impl PostgresServer {
|
||||
bail!("Connection timed out");
|
||||
}
|
||||
|
||||
pub fn kill(mut self) {
|
||||
pub fn kill(&mut self) {
|
||||
self.process.kill().unwrap();
|
||||
self.process.wait().unwrap();
|
||||
}
|
||||
@@ -191,16 +181,12 @@ pub trait PostgresClientExt: postgres::GenericClient {
|
||||
|
||||
impl<C: postgres::GenericClient> PostgresClientExt for C {}
|
||||
|
||||
pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
|
||||
fn generate_internal<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
|
||||
) -> Result<PgLsn> {
|
||||
client.execute("create extension if not exists neon_test_utils", &[])?;
|
||||
|
||||
let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
|
||||
ensure!(wal_keep_size == "50MB");
|
||||
let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0);
|
||||
ensure!(wal_writer_delay == "10s");
|
||||
let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0);
|
||||
ensure!(autovacuum == "off");
|
||||
|
||||
let wal_segment_size = client.query_one(
|
||||
"select cast(setting as bigint) as setting, unit \
|
||||
from pg_settings where name = 'wal_segment_size'",
|
||||
@@ -215,29 +201,13 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result
|
||||
"Unexpected wal_segment_size in bytes"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub trait Crafter {
|
||||
const NAME: &'static str;
|
||||
|
||||
/// Generates WAL using the client `client`. Returns the expected end-of-wal LSN.
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn>;
|
||||
}
|
||||
|
||||
fn craft_internal<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
|
||||
) -> Result<PgLsn> {
|
||||
ensure_server_config(client)?;
|
||||
|
||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
info!("LSN initial = {}", initial_lsn);
|
||||
|
||||
let last_lsn = match f(client, initial_lsn)? {
|
||||
None => client.pg_current_wal_insert_lsn()?,
|
||||
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
|
||||
Ordering::Less => bail!("Some records were inserted after the generated WAL"),
|
||||
Ordering::Equal => last_lsn,
|
||||
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
||||
},
|
||||
@@ -246,116 +216,25 @@ fn craft_internal<C: postgres::GenericClient>(
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
|
||||
Ordering::Less => bail!("Some records were flushed after the generated WAL"),
|
||||
Ordering::Equal => {}
|
||||
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
||||
}
|
||||
Ok(last_lsn)
|
||||
}
|
||||
|
||||
pub struct Simple;
|
||||
impl Crafter for Simple {
|
||||
const NAME: &'static str = "simple";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
craft_internal(client, |client, _| {
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
Ok(None)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LastWalRecordXlogSwitch;
|
||||
impl Crafter for LastWalRecordXlogSwitch {
|
||||
const NAME: &'static str = "last_wal_record_xlog_switch";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||
ensure_server_config(client)?;
|
||||
|
||||
pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
generate_internal(client, |client, _| {
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
after_xlog_switch <= next_segment,
|
||||
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
|
||||
after_xlog_switch,
|
||||
next_segment
|
||||
);
|
||||
Ok(next_segment)
|
||||
}
|
||||
Ok(None)
|
||||
})
|
||||
}
|
||||
|
||||
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
|
||||
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
|
||||
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
// Do not use generate_internal because here we end up with flush_lsn exactly on
|
||||
// the segment boundary and insert_lsn after the initial page header, which is unusual.
|
||||
ensure_server_config(client)?;
|
||||
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
|
||||
// Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
|
||||
// We will use logical message as the padding. We start with detecting how much WAL
|
||||
// it takes for one logical message, considering all alignments and headers.
|
||||
let base_wal_advance = {
|
||||
let before_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
// Small non-empty message bigger than few bytes is more likely than an empty
|
||||
// message to have the same format as the big padding message.
|
||||
client.execute(
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
|
||||
&[],
|
||||
)?;
|
||||
// The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
|
||||
(u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
|
||||
+ XLOG_SIZE_OF_XLOG_RECORD
|
||||
};
|
||||
let mut remaining_lsn =
|
||||
XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
|
||||
if remaining_lsn < base_wal_advance {
|
||||
remaining_lsn += XLOG_BLCKSZ;
|
||||
}
|
||||
let repeats = 10 + remaining_lsn - base_wal_advance;
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
remaining_lsn,
|
||||
base_wal_advance,
|
||||
repeats
|
||||
);
|
||||
client.execute(
|
||||
"SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
|
||||
&[&(repeats as i32)],
|
||||
)?;
|
||||
info!(
|
||||
"current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
|
||||
client.pg_current_wal_insert_lsn()?,
|
||||
XLOG_SIZE_OF_XLOG_RECORD
|
||||
);
|
||||
|
||||
// Emit the XLOG_SWITCH
|
||||
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
|
||||
let next_segment = PgLsn::from(0x0200_0000);
|
||||
ensure!(
|
||||
after_xlog_switch < next_segment,
|
||||
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
|
||||
after_xlog_switch,
|
||||
next_segment
|
||||
);
|
||||
ensure!(
|
||||
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
|
||||
"XLOG_SWITCH message ended not on page boundary: {}",
|
||||
after_xlog_switch
|
||||
);
|
||||
Ok(next_segment)
|
||||
}
|
||||
}
|
||||
|
||||
fn craft_single_logical_message(
|
||||
fn generate_single_logical_message(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
transactional: bool,
|
||||
) -> Result<PgLsn> {
|
||||
craft_internal(client, |client, initial_lsn| {
|
||||
generate_internal(client, |client, initial_lsn| {
|
||||
ensure!(
|
||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||
"Initial LSN is too far in the future"
|
||||
@@ -393,18 +272,14 @@ fn craft_single_logical_message(
|
||||
})
|
||||
}
|
||||
|
||||
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
|
||||
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
|
||||
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
craft_single_logical_message(client, true)
|
||||
}
|
||||
pub fn generate_wal_record_crossing_segment_followed_by_small_one(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
) -> Result<PgLsn> {
|
||||
generate_single_logical_message(client, true)
|
||||
}
|
||||
|
||||
pub struct LastWalRecordCrossingSegment;
|
||||
impl Crafter for LastWalRecordCrossingSegment {
|
||||
const NAME: &'static str = "last_wal_record_crossing_segment";
|
||||
fn craft(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
craft_single_logical_message(client, false)
|
||||
}
|
||||
pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
) -> Result<PgLsn> {
|
||||
generate_single_logical_message(client, false)
|
||||
}
|
||||
@@ -12,10 +12,8 @@ use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
ffi::OsStr,
|
||||
fmt::Debug,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
@@ -42,19 +40,13 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
pub trait RemoteObjectName {
|
||||
// Needed to retrieve last component for RemoteObjectId.
|
||||
// In other words a file name
|
||||
fn object_name(&self) -> Option<&str>;
|
||||
}
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type RemoteObjectId: RemoteObjectName;
|
||||
type RemoteObjectId;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
|
||||
@@ -65,12 +57,6 @@ pub trait RemoteStorage: Send + Sync {
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Lists all top level subdirectories for a given prefix
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
@@ -84,7 +70,11 @@ pub trait RemoteStorage: Send + Sync {
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError>;
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
@@ -93,49 +83,12 @@ pub trait RemoteStorage: Send + Sync {
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError>;
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
pub struct Download {
|
||||
pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
|
||||
/// Extra key-value data, associated with the current remote file.
|
||||
pub metadata: Option<StorageMetadata>,
|
||||
}
|
||||
|
||||
impl Debug for Download {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Download")
|
||||
.field("metadata", &self.metadata)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum DownloadError {
|
||||
/// Validation or other error happened due to user input.
|
||||
BadInput(anyhow::Error),
|
||||
/// The file was not found in the remote storage.
|
||||
NotFound,
|
||||
/// The file was found in the remote storage, but the download failed.
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DownloadError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
DownloadError::BadInput(e) => {
|
||||
write!(f, "Failed to download a remote file due to user input: {e}")
|
||||
}
|
||||
DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
|
||||
DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::error::Error for DownloadError {}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
pub enum GenericRemoteStorage {
|
||||
@@ -227,7 +180,7 @@ pub struct S3Config {
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl Debug for S3Config {
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
//! volume is mounted to the local FS.
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
future::Future,
|
||||
path::{Path, PathBuf},
|
||||
pin::Pin,
|
||||
@@ -18,16 +17,10 @@ use tokio::{
|
||||
};
|
||||
use tracing::*;
|
||||
|
||||
use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};
|
||||
use crate::path_with_suffix_extension;
|
||||
|
||||
use super::{strip_path_prefix, RemoteStorage, StorageMetadata};
|
||||
|
||||
impl RemoteObjectName for PathBuf {
|
||||
fn object_name(&self) -> Option<&str> {
|
||||
self.file_stem().and_then(|n| n.to_str())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LocalFs {
|
||||
working_directory: PathBuf,
|
||||
storage_root: PathBuf,
|
||||
@@ -108,18 +101,7 @@ impl RemoteStorage for LocalFs {
|
||||
}
|
||||
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
get_all_files(&self.storage_root, true).await
|
||||
}
|
||||
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
let path = match prefix {
|
||||
Some(prefix) => Cow::Owned(self.storage_root.join(prefix)),
|
||||
None => Cow::Borrowed(&self.storage_root),
|
||||
};
|
||||
get_all_files(path.as_ref(), false).await
|
||||
get_all_files(&self.storage_root).await
|
||||
}
|
||||
|
||||
async fn upload(
|
||||
@@ -210,56 +192,14 @@ impl RemoteStorage for LocalFs {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
|
||||
let file_path = self
|
||||
.resolve_in_storage(from)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
|
||||
let source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
);
|
||||
|
||||
let metadata = self
|
||||
.read_storage_metadata(&file_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
Ok(Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source),
|
||||
})
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
if end_exclusive <= start_inclusive {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})")));
|
||||
};
|
||||
if start_inclusive == end_exclusive.saturating_sub(1) {
|
||||
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
|
||||
}
|
||||
}
|
||||
let file_path = self
|
||||
.resolve_in_storage(from)
|
||||
.map_err(DownloadError::BadInput)?;
|
||||
if file_exists(&file_path).map_err(DownloadError::BadInput)? {
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
@@ -270,31 +210,81 @@ impl RemoteStorage for LocalFs {
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})
|
||||
.map_err(DownloadError::Other)?,
|
||||
})?,
|
||||
);
|
||||
io::copy(&mut source, to).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
source.flush().await?;
|
||||
|
||||
self.read_storage_metadata(&file_path).await
|
||||
} else {
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
if let Some(end_exclusive) = end_exclusive {
|
||||
ensure!(
|
||||
end_exclusive > start_inclusive,
|
||||
"Invalid range, start ({}) is bigger then end ({:?})",
|
||||
start_inclusive,
|
||||
end_exclusive
|
||||
);
|
||||
if start_inclusive == end_exclusive.saturating_sub(1) {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let file_path = self.resolve_in_storage(from)?;
|
||||
|
||||
if file_path.exists() && file_path.is_file() {
|
||||
let mut source = io::BufReader::new(
|
||||
fs::OpenOptions::new()
|
||||
.read(true)
|
||||
.open(&file_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open source file '{}' to use in the download",
|
||||
file_path.display()
|
||||
)
|
||||
})?,
|
||||
);
|
||||
source
|
||||
.seek(io::SeekFrom::Start(start_inclusive))
|
||||
.await
|
||||
.context("Failed to seek to the range start in a local storage file")
|
||||
.map_err(DownloadError::Other)?;
|
||||
let metadata = self
|
||||
.read_storage_metadata(&file_path)
|
||||
.await
|
||||
.map_err(DownloadError::Other)?;
|
||||
.context("Failed to seek to the range start in a local storage file")?;
|
||||
match end_exclusive {
|
||||
Some(end_exclusive) => {
|
||||
io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
|
||||
}
|
||||
None => io::copy(&mut source, to).await,
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to download file '{}' range from the local storage",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(match end_exclusive {
|
||||
Some(end_exclusive) => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
|
||||
},
|
||||
None => Download {
|
||||
metadata,
|
||||
download_stream: Box::pin(source),
|
||||
},
|
||||
})
|
||||
self.read_storage_metadata(&file_path).await
|
||||
} else {
|
||||
Err(DownloadError::NotFound)
|
||||
bail!(
|
||||
"File '{}' either does not exist or is not a file",
|
||||
file_path.display()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -317,7 +307,6 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {
|
||||
|
||||
fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
where
    P: AsRef<Path> + Send + Sync + 'a,
@@ -334,11 +323,7 @@ where
            if file_type.is_symlink() {
                debug!("{:?} is a symlink, skipping", entry_path)
            } else if file_type.is_dir() {
                if recursive {
                    paths.extend(get_all_files(entry_path, true).await?.into_iter())
                } else {
                    paths.push(dir_entry.path())
                }
                paths.extend(get_all_files(entry_path).await?.into_iter())
            } else {
                paths.push(dir_entry.path());
            }
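Aside (not part of the patch): the boxed `Pin<Box<dyn Future<...>>>` return type above is the usual workaround for recursion in async code, since a recursive `async fn` would have an infinitely sized future. The same shape in a self-contained sketch:

```rust
use std::{future::Future, pin::Pin};

// A recursive async function must box its future; get_all_files does the same.
fn count_down(n: u64) -> Pin<Box<dyn Future<Output = u64> + Send>> {
    Box::pin(async move {
        if n == 0 {
            0
        } else {
            count_down(n - 1).await + 1
        }
    })
}
```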
|
||||
@@ -367,19 +352,6 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
    if file_path.exists() {
        ensure!(
            file_path.is_file(),
            "file path '{}' is not a file",
            file_path.display()
        );
        Ok(true)
    } else {
        Ok(false)
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod pure_tests {
|
||||
use tempfile::tempdir;
|
||||
@@ -546,31 +518,6 @@ mod fs_tests {
|
||||
use std::{collections::HashMap, io::Write};
|
||||
use tempfile::tempdir;
|
||||
|
||||
async fn read_and_assert_remote_file_contents(
|
||||
storage: &LocalFs,
|
||||
#[allow(clippy::ptr_arg)]
|
||||
// have to use &PathBuf due to `storage.local_path` parameter requirements
|
||||
remote_storage_path: &PathBuf,
|
||||
expected_metadata: Option<&StorageMetadata>,
|
||||
) -> anyhow::Result<String> {
|
||||
let mut download = storage
|
||||
.download(remote_storage_path)
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
|
||||
ensure!(
|
||||
download.metadata.as_ref() == expected_metadata,
|
||||
"Unexpected metadata returned for the downloaded file"
|
||||
);
|
||||
|
||||
let mut contents = String::new();
|
||||
download
|
||||
.download_stream
|
||||
.read_to_string(&mut contents)
|
||||
.await
|
||||
.context("Failed to read remote file contents into string")?;
|
||||
Ok(contents)
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn upload_file() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
@@ -621,7 +568,15 @@ mod fs_tests {
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage.download(&upload_target, &mut content_bytes).await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
content_bytes.flush().await?;
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
contents,
|
||||
@@ -629,9 +584,13 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage.download(&non_existing_path).await {
|
||||
Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
|
||||
other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
|
||||
match storage.download(&non_existing_path, &mut io::sink()).await {
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage files"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -644,31 +603,58 @@ mod fs_tests {
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
|
||||
let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage
|
||||
.download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
full_range_bytes.flush().await?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
full_range_download_contents,
|
||||
String::from_utf8(full_range_bytes.into_inner().into_inner())?,
|
||||
"Download full range should return the whole upload"
|
||||
);
|
||||
|
||||
let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let same_byte = 1_000_000_000;
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
same_byte,
|
||||
Some(same_byte + 1), // exclusive end
|
||||
&mut zero_range_bytes,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
zero_range_bytes.flush().await?;
|
||||
assert!(
|
||||
zero_range_bytes.into_inner().into_inner().is_empty(),
|
||||
"Zero byte range should not download any part of the file"
|
||||
);
|
||||
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut first_part_download = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
first_part_download.metadata.is_none(),
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut first_part_download.download_stream,
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
@@ -677,24 +663,20 @@ mod fs_tests {
|
||||
"First part bytes should be returned when requested"
|
||||
);
|
||||
|
||||
let mut second_part_download = storage
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
first_part_local.len() as u64,
|
||||
Some((first_part_local.len() + second_part_local.len()) as u64),
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
assert!(
|
||||
second_part_download.metadata.is_none(),
|
||||
metadata.is_none(),
|
||||
"No metadata should be returned for no metadata upload"
|
||||
);
|
||||
|
||||
let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut second_part_download.download_stream,
|
||||
&mut second_part_remote,
|
||||
)
|
||||
.await?;
|
||||
second_part_remote.flush().await?;
|
||||
let second_part_remote = second_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
@@ -714,30 +696,11 @@ mod fs_tests {
|
||||
let upload_name = "upload_1";
|
||||
let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;
|
||||
|
||||
let start = 1_000_000_000;
|
||||
let end = start + 1;
|
||||
match storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
start,
|
||||
Some(end), // exclusive end
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("zero bytes"));
|
||||
assert!(error_string.contains(&start.to_string()));
|
||||
assert!(error_string.contains(&end.to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
let start = 10000;
|
||||
let end = 234;
|
||||
assert!(start > end, "Should test an incorrect range");
|
||||
match storage
|
||||
.download_byte_range(&upload_target, start, Some(end))
|
||||
.download_byte_range(&upload_target, start, Some(end), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading wrong ranges"),
|
||||
@@ -749,6 +712,18 @@ mod fs_tests {
|
||||
}
|
||||
}
|
||||
|
||||
let non_existing_path = PathBuf::from("somewhere").join("else");
|
||||
match storage
|
||||
.download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
|
||||
.await
|
||||
{
|
||||
Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
|
||||
Err(e) => {
|
||||
let error_string = e.to_string();
|
||||
assert!(error_string.contains("does not exist"));
|
||||
assert!(error_string.contains(&non_existing_path.display().to_string()));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -787,26 +762,35 @@ mod fs_tests {
|
||||
let upload_target =
|
||||
upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;
|
||||
|
||||
let full_range_download_contents =
|
||||
read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
|
||||
let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
|
||||
|
||||
content_bytes.flush().await?;
|
||||
let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
|
||||
assert_eq!(
|
||||
dummy_contents(upload_name),
|
||||
full_range_download_contents,
|
||||
contents,
|
||||
"We should upload and download the same contents"
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
full_download_metadata.as_ref(),
|
||||
Some(&metadata),
|
||||
"We should get the same metadata back for full download"
|
||||
);
|
||||
|
||||
let uploaded_bytes = dummy_contents(upload_name).into_bytes();
|
||||
let (first_part_local, _) = uploaded_bytes.split_at(3);
|
||||
|
||||
let mut partial_download_with_metadata = storage
|
||||
.download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
|
||||
.await?;
|
||||
let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
|
||||
io::copy(
|
||||
&mut partial_download_with_metadata.download_stream,
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
let partial_download_metadata = storage
|
||||
.download_byte_range(
|
||||
&upload_target,
|
||||
0,
|
||||
Some(first_part_local.len() as u64),
|
||||
&mut first_part_remote,
|
||||
)
|
||||
.await?;
|
||||
first_part_remote.flush().await?;
|
||||
let first_part_remote = first_part_remote.into_inner().into_inner();
|
||||
assert_eq!(
|
||||
@@ -816,8 +800,8 @@ mod fs_tests {
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
partial_download_with_metadata.metadata,
|
||||
Some(metadata),
|
||||
partial_download_metadata.as_ref(),
|
||||
Some(&metadata),
|
||||
"We should get the same metadata back for partial download"
|
||||
);
|
||||
|
||||
@@ -859,7 +843,7 @@ mod fs_tests {
|
||||
}
|
||||
|
||||
fn dummy_contents(name: &str) -> String {
|
||||
format!("contents for {name}")
|
||||
format!("contents for {}", name)
|
||||
}
|
||||
|
||||
async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
|
||||
|
||||
@@ -9,19 +9,17 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::Context;
|
||||
use rusoto_core::{
|
||||
credential::{InstanceMetadataProvider, StaticProvider},
|
||||
HttpClient, Region, RusotoError,
|
||||
HttpClient, Region,
|
||||
};
|
||||
use rusoto_s3::{
|
||||
DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
|
||||
S3Client, StreamingBody, S3,
|
||||
DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
|
||||
StreamingBody, S3,
|
||||
};
|
||||
use tokio::{io, sync::Semaphore};
|
||||
use tokio_util::io::ReaderStream;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::{
|
||||
strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
|
||||
};
|
||||
use crate::{strip_path_prefix, RemoteStorage, S3Config};
|
||||
|
||||
use super::StorageMetadata;
|
||||
|
||||
@@ -119,25 +117,6 @@ impl S3ObjectKey {
|
||||
}
|
||||
}
|
||||
|
||||
impl RemoteObjectName for S3ObjectKey {
    /// Turn a/b/c or a/b/c/ into c
    fn object_name(&self) -> Option<&str> {
        // corner case: char::to_string is not const, that's why this is more verbose than it needs to be,
        // see https://github.com/rust-lang/rust/issues/88674
        if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
            return None;
        }

        if self.0.ends_with(S3_PREFIX_SEPARATOR) {
            self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
        } else {
            self.0
                .rsplit_once(S3_PREFIX_SEPARATOR)
                .map(|(_, last)| last)
        }
    }
}
|
||||
|
||||
/// AWS S3 storage.
|
||||
pub struct S3Bucket {
|
||||
workdir: PathBuf,
|
||||
@@ -208,39 +187,6 @@ impl S3Bucket {
|
||||
concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
|
||||
})
|
||||
}
|
||||
|
||||
async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
metrics::inc_get_object();
|
||||
|
||||
match self.client.get_object(request).await {
|
||||
Ok(object_output) => match object_output.body {
|
||||
None => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Got no body for the S3 object given"
|
||||
)))
|
||||
}
|
||||
Some(body) => Ok(Download {
|
||||
metadata: object_output.metadata.map(StorageMetadata),
|
||||
download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
|
||||
}),
|
||||
},
|
||||
Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
|
||||
Err(e) => {
|
||||
metrics::inc_get_object_fail();
|
||||
Err(DownloadError::Other(anyhow::anyhow!(
|
||||
"Failed to download S3 object: {e}"
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait::async_trait]
|
||||
@@ -304,77 +250,6 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(document_keys)
|
||||
}
|
||||
|
||||
/// Note: it won't include empty "directories"
|
||||
async fn list_prefixes(
|
||||
&self,
|
||||
prefix: Option<Self::RemoteObjectId>,
|
||||
) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
|
||||
let list_prefix = match prefix {
|
||||
Some(prefix) => {
|
||||
let mut prefix_in_bucket = self.prefix_in_bucket.clone().unwrap_or_default();
|
||||
// if the default prefix has no trailing "/" and the
// supplied prefix does not start with "/", insert one
|
||||
if !(prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR)
|
||||
|| prefix.0.starts_with(S3_PREFIX_SEPARATOR))
|
||||
{
|
||||
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
|
||||
}
|
||||
|
||||
prefix_in_bucket.push_str(&prefix.0);
|
||||
// required to end with a separator,
// otherwise the request will return only the prefix entry itself
|
||||
if !prefix_in_bucket.ends_with(S3_PREFIX_SEPARATOR) {
|
||||
prefix_in_bucket.push(S3_PREFIX_SEPARATOR);
|
||||
}
|
||||
Some(prefix_in_bucket)
|
||||
}
|
||||
None => self.prefix_in_bucket.clone(),
|
||||
};
|
||||
|
||||
let mut document_keys = Vec::new();
|
||||
|
||||
let mut continuation_token = None;
|
||||
loop {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 list")?;
|
||||
|
||||
metrics::inc_list_objects();
|
||||
|
||||
let fetch_response = self
|
||||
.client
|
||||
.list_objects_v2(ListObjectsV2Request {
|
||||
bucket: self.bucket_name.clone(),
|
||||
prefix: list_prefix.clone(),
|
||||
continuation_token,
|
||||
delimiter: Some(S3_PREFIX_SEPARATOR.to_string()),
|
||||
..ListObjectsV2Request::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_list_objects_fail();
|
||||
e
|
||||
})?;
|
||||
|
||||
document_keys.extend(
|
||||
fetch_response
|
||||
.common_prefixes
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.filter_map(|o| Some(S3ObjectKey(o.prefix?))),
|
||||
);
|
||||
|
||||
match fetch_response.continuation_token {
|
||||
Some(new_token) => continuation_token = Some(new_token),
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
Ok(document_keys)
|
||||
}
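Aside (not part of the patch): with `delimiter: Some("/")`, S3 collapses everything below the first separator after the prefix into `common_prefixes`, which is what makes the loop above a one-level listing. A toy model of that grouping, with made-up keys (illustrative, not the AWS API):

```rust
use std::collections::BTreeSet;

// Group keys under `prefix` by the next '/'-terminated component, the way
// a delimiter-based ListObjectsV2 response would.
fn common_prefixes(keys: &[&str], prefix: &str) -> BTreeSet<String> {
    keys.iter()
        .filter_map(|k| k.strip_prefix(prefix))
        .filter_map(|rest| rest.split_once('/').map(|(first, _)| format!("{prefix}{first}/")))
        .collect()
}

// common_prefixes(&["a/t1/x", "a/t1/y", "a/t2/x"], "a/") == {"a/t1/", "a/t2/"}
```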
|
||||
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
@@ -408,13 +283,38 @@ impl RemoteStorage for S3Bucket {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.key().to_owned(),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 download")?;
|
||||
|
||||
metrics::inc_get_object();
|
||||
|
||||
let object_output = self
|
||||
.client
|
||||
.get_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.key().to_owned(),
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_get_object_fail();
|
||||
e
|
||||
})?;
|
||||
|
||||
if let Some(body) = object_output.body {
|
||||
let mut from = io::BufReader::new(body.into_async_read());
|
||||
io::copy(&mut from, to).await?;
|
||||
}
|
||||
|
||||
Ok(object_output.metadata.map(StorageMetadata))
|
||||
}
|
||||
|
||||
async fn download_byte_range(
|
||||
@@ -422,7 +322,8 @@ impl RemoteStorage for S3Bucket {
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
) -> Result<Download, DownloadError> {
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>> {
|
||||
// S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
|
||||
// and needs both ends to be inclusive
|
||||
let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
|
||||
@@ -430,14 +331,34 @@ impl RemoteStorage for S3Bucket {
|
||||
Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
|
||||
None => format!("bytes={}-", start_inclusive),
|
||||
});
|
||||
let _guard = self
|
||||
.concurrency_limiter
|
||||
.acquire()
|
||||
.await
|
||||
.context("Concurrency limiter semaphore got closed during S3 range download")?;
|
||||
|
||||
self.download_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.key().to_owned(),
|
||||
range,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
metrics::inc_get_object();
|
||||
|
||||
let object_output = self
|
||||
.client
|
||||
.get_object(GetObjectRequest {
|
||||
bucket: self.bucket_name.clone(),
|
||||
key: from.key().to_owned(),
|
||||
range,
|
||||
..GetObjectRequest::default()
|
||||
})
|
||||
.await
|
||||
.map_err(|e| {
|
||||
metrics::inc_get_object_fail();
|
||||
e
|
||||
})?;
|
||||
|
||||
if let Some(body) = object_output.body {
|
||||
let mut from = io::BufReader::new(body.into_async_read());
|
||||
io::copy(&mut from, to).await?;
|
||||
}
|
||||
|
||||
Ok(object_output.metadata.map(StorageMetadata))
|
||||
}
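Aside (not part of the patch): spelled out, the exclusive-to-inclusive conversion at the top of `download_byte_range` turns the caller's half-open range into the HTTP `Range` header form:

```rust
// Illustrative only; mirrors the conversion in download_byte_range.
fn to_s3_range(start_inclusive: u64, end_exclusive: Option<u64>) -> String {
    match end_exclusive.map(|end| end.saturating_sub(1)) {
        Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"),
        None => format!("bytes={start_inclusive}-"),
    }
}

// to_s3_range(0, Some(10)) == "bytes=0-9"; to_s3_range(512, None) == "bytes=512-"
```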
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
|
||||
@@ -470,25 +391,6 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn object_name() {
|
||||
let k = S3ObjectKey("a/b/c".to_owned());
|
||||
assert_eq!(k.object_name(), Some("c"));
|
||||
|
||||
let k = S3ObjectKey("a/b/c/".to_owned());
|
||||
assert_eq!(k.object_name(), Some("c"));
|
||||
|
||||
let k = S3ObjectKey("a/".to_owned());
|
||||
assert_eq!(k.object_name(), Some("a"));
|
||||
|
||||
// XXX is it impossible to have an empty key?
|
||||
let k = S3ObjectKey("".to_owned());
|
||||
assert_eq!(k.object_name(), None);
|
||||
|
||||
let k = S3ObjectKey("/".to_owned());
|
||||
assert_eq!(k.object_name(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn download_destination() -> anyhow::Result<()> {
|
||||
let workdir = tempdir()?.path().to_owned();
|
||||
|
||||
@@ -537,13 +537,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
|
||||
match tenant_match.subcommand() {
|
||||
Some(("list", _)) => {
|
||||
for t in pageserver.tenant_list()? {
|
||||
println!(
|
||||
"{} {}",
|
||||
t.id,
|
||||
t.state
|
||||
.map(|s| s.to_string())
|
||||
.unwrap_or_else(|| String::from(""))
|
||||
);
|
||||
println!("{} {}", t.id, t.state);
|
||||
}
|
||||
}
|
||||
Some(("create", create_match)) => {
|
||||
|
||||
@@ -60,7 +60,6 @@ where
|
||||
write: W,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
req_lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
) -> Result<Basebackup<'a, W>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
@@ -97,26 +96,16 @@ where
|
||||
(end_of_timeline.prev, end_of_timeline.last)
|
||||
};
|
||||
|
||||
// Consolidate the derived and the provided prev_lsn values
|
||||
let prev_lsn = if let Some(provided_prev_lsn) = prev_lsn {
|
||||
if backup_prev != Lsn(0) {
|
||||
ensure!(backup_prev == provided_prev_lsn)
|
||||
}
|
||||
provided_prev_lsn
|
||||
} else {
|
||||
backup_prev
|
||||
};
|
||||
|
||||
info!(
|
||||
"taking basebackup lsn={}, prev_lsn={} (full_backup={})",
|
||||
backup_lsn, prev_lsn, full_backup
|
||||
backup_lsn, backup_prev, full_backup
|
||||
);
|
||||
|
||||
Ok(Basebackup {
|
||||
ar: Builder::new(AbortableWrite::new(write)),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: prev_lsn,
|
||||
prev_record_lsn: backup_prev,
|
||||
full_backup,
|
||||
finished: false,
|
||||
})
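Aside (not part of the patch): one side of this diff consolidates the derived `backup_prev` with the caller-provided `prev_lsn`. Restated as a standalone rule (assuming `Lsn` is `PartialEq`, as its use elsewhere in the diff suggests):

```rust
// Illustrative restatement of the consolidation rule, not code from the patch.
fn consolidate_prev_lsn(derived: Lsn, provided: Option<Lsn>) -> anyhow::Result<Lsn> {
    match provided {
        // A caller-provided value wins, but must agree with a non-zero derived one.
        Some(provided_prev_lsn) => {
            if derived != Lsn(0) {
                anyhow::ensure!(derived == provided_prev_lsn, "prev_lsn mismatch");
            }
            Ok(provided_prev_lsn)
        }
        None => Ok(derived),
    }
}
```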
|
||||
|
||||
@@ -263,8 +263,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
// start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
|
||||
pageserver::tenant_tasks::init_tenant_task_pool()?;
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
AuthType::Trust | AuthType::MD5 => None,
|
||||
|
||||
@@ -22,49 +22,6 @@ paths:
|
||||
properties:
|
||||
id:
|
||||
type: integer
|
||||
|
||||
/v1/tenant/{tenant_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get tenant status
|
||||
responses:
|
||||
"200":
|
||||
description: Currently returns a flag indicating whether the tenant has in-progress timeline downloads
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -113,7 +70,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -128,14 +84,13 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
get:
|
||||
description: Get info about the timeline
|
||||
parameters:
|
||||
- name: include-non-incremental-logical-size
|
||||
in: query
|
||||
schema:
|
||||
type: string
|
||||
description: Controls calculation of current_logical_size_non_incremental
|
||||
responses:
|
||||
"200":
|
||||
description: TimelineInfo
|
||||
@@ -167,35 +122,6 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
delete:
|
||||
description: "Attempts to delete specified timeline. On 500 errors should be retried"
|
||||
responses:
|
||||
"200":
|
||||
description: Ok
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
|
||||
parameters:
|
||||
@@ -245,7 +171,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/attach:
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
@@ -253,13 +179,19 @@ paths:
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Schedules attach operation to happen in the background for given tenant
|
||||
description: Attach remote timeline
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant attaching scheduled
|
||||
"200":
|
||||
description: Timeline attaching scheduled
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -283,7 +215,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Tenant download is already in progress
|
||||
description: Timeline download is already in progress
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
@@ -295,6 +227,7 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
@@ -310,11 +243,10 @@ paths:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Deprecated, use DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id} instead
|
||||
deprecated: true
|
||||
description: Detach local timeline
|
||||
responses:
|
||||
"200":
|
||||
description: Ok
|
||||
description: Timeline detached
|
||||
"400":
|
||||
description: Error when no tenant id found in path or no timeline id
|
||||
content:
|
||||
@@ -340,43 +272,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/detach:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
post:
|
||||
description: Detach local tenant
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant detached
|
||||
"400":
|
||||
description: Error when no tenant id found in path parameters
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/:
|
||||
parameters:
|
||||
@@ -572,13 +467,12 @@ components:
|
||||
type: object
|
||||
required:
|
||||
- id
|
||||
- state
|
||||
properties:
|
||||
id:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
has_in_progress_downloads:
|
||||
type: boolean
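Aside (not part of the spec change): the TenantInfo schema above corresponds to a serde-serialized struct along these lines. Field names are taken from the schema; the exact Rust definition lives in the pageserver crate and may differ, so treat this as an illustration only:

```rust
use serde::Serialize;

// Hypothetical mirror of the TenantInfo schema above.
#[derive(Serialize)]
struct TenantInfoExample {
    id: String,
    state: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    has_in_progress_downloads: Option<bool>,
}
```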
|
||||
TenantCreateInfo:
|
||||
type: object
|
||||
properties:
|
||||
@@ -673,7 +567,6 @@ components:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
|
||||
WalReceiverEntry:
|
||||
type: object
|
||||
required:
|
||||
|
||||
@@ -14,7 +14,6 @@ use crate::repository::Repository;
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::tenant_mgr::TenantInfo;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
use utils::{
|
||||
@@ -210,9 +209,9 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
.await;
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
return Err(ApiError::NotFound(format!(
|
||||
"Timeline {tenant_id}/{timeline_id} is not found neither locally nor remotely"
|
||||
)));
|
||||
return Err(ApiError::NotFound(
|
||||
"Timeline is not found neither locally nor remotely".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
let timeline_info = TimelineInfo {
|
||||
@@ -242,157 +241,123 @@ async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Bod
|
||||
json_response(StatusCode::OK, &wal_receiver_entry)
|
||||
}
|
||||
|
||||
// TODO: it makes sense to provide the tenant config right away, the same way as it is handled in tenant_create
|
||||
async fn tenant_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
info!("Handling tenant attach {}", tenant_id,);
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
info!(
|
||||
"Handling timeline {} attach for tenant: {}",
|
||||
timeline_id, tenant_id,
|
||||
);
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_tenant_state(tenant_id).is_some() {
|
||||
anyhow::bail!("Tenant is already present locally")
|
||||
if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
|
||||
// TODO: maybe answer with 304 Not Modified here?
|
||||
anyhow::bail!("Timeline is already present locally")
|
||||
};
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
if let Some(tenant_entry) = index_accessor.tenant_entry_mut(&tenant_id) {
|
||||
if tenant_entry.has_in_progress_downloads() {
|
||||
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Tenant download is already in progress".to_string(),
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
for (timeline_id, remote_timeline) in tenant_entry.iter_mut() {
|
||||
storage_sync::schedule_layer_download(tenant_id, *timeline_id);
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
}
|
||||
// no tenant in the index, release the lock to perform the potentially lengthy download operation
|
||||
drop(index_accessor);
|
||||
|
||||
// download index parts for every tenant timeline
|
||||
let remote_timelines = match gather_tenant_timelines_index_parts(state, tenant_id).await {
|
||||
Ok(Some(remote_timelines)) => remote_timelines,
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote tenant".to_string())),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote tenant data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote tenant".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
// recheck that download is not in progress because
|
||||
// we've released the lock to avoid holding it during the download
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
let tenant_entry = match index_accessor.tenant_entry_mut(&tenant_id) {
|
||||
Some(tenant_entry) => {
|
||||
if tenant_entry.has_in_progress_downloads() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Tenant download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
tenant_entry
|
||||
}
|
||||
None => index_accessor.add_tenant_entry(tenant_id),
|
||||
};
|
||||
|
||||
// populate remote index with the data from index part and create directories on the local filesystem
|
||||
for (timeline_id, mut remote_timeline) in remote_timelines {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
tenant_entry.insert(timeline_id, remote_timeline);
|
||||
// schedule actual download
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
} else {
|
||||
// no timeline in the index, release the lock to perform the potentially lengthy download operation
|
||||
drop(index_accessor);
|
||||
}
|
||||
|
||||
let new_timeline = match try_download_index_part_data(state, sync_id).await {
|
||||
Ok(Some(mut new_timeline)) => {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
new_timeline.awaits_download = true;
|
||||
new_timeline
|
||||
}
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote timeline data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote timeline".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
match index_accessor.timeline_entry_mut(&sync_id) {
|
||||
Some(remote_timeline) => {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
|
||||
}
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
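Aside (not part of the patch): the handler above follows a check / release-the-lock / fetch / re-take-and-recheck pattern so the remote index lock is not held across the slow download. Reduced to its shape, with stand-in types rather than the real `RemoteIndex` API:

```rust
use std::collections::HashMap;
use tokio::sync::RwLock;

// Stand-ins for illustration only.
type SyncId = u64;
struct Entry {
    awaits_download: bool,
}

async fn attach_once(
    index: &RwLock<HashMap<SyncId, Entry>>,
    sync_id: SyncId,
    fetch: impl std::future::Future<Output = anyhow::Result<Entry>>,
) -> anyhow::Result<()> {
    {
        // Fast path: refuse if a download is already in flight.
        let guard = index.read().await;
        if guard.get(&sync_id).map_or(false, |e| e.awaits_download) {
            anyhow::bail!("download already in progress");
        }
    } // lock released before the potentially lengthy fetch

    let mut fetched = fetch.await?;
    fetched.awaits_download = true;

    // Re-take the lock and re-check: another request may have raced us meanwhile.
    let mut guard = index.write().await;
    match guard.get_mut(&sync_id) {
        Some(existing) if existing.awaits_download => {
            anyhow::bail!("download already in progress")
        }
        Some(existing) => existing.awaits_download = true,
        None => {
            guard.insert(sync_id, fetched);
        }
    }
    Ok(())
}
```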
|
||||
|
||||
/// Note: this is expensive from an S3 access perspective,
|
||||
/// for details see comment to `storage_sync::gather_tenant_timelines_index_parts`
|
||||
async fn gather_tenant_timelines_index_parts(
|
||||
async fn try_download_index_part_data(
|
||||
state: &State,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<Option<Vec<(ZTimelineId, RemoteTimeline)>>> {
|
||||
let index_parts = match state.remote_storage.as_ref() {
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<Option<RemoteTimeline>> {
|
||||
let index_part = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, local_storage, tenant_id)
|
||||
.await
|
||||
storage_sync::download_index_part(state.conf, local_storage, sync_id).await
|
||||
}
|
||||
// FIXME here s3 storage contains its own limits, that are separate from sync storage thread ones
|
||||
// because it is a different instance. We can move this limit to some global static
|
||||
// or use one instance everywhere.
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
storage_sync::gather_tenant_timelines_index_parts(state.conf, s3_storage, tenant_id)
|
||||
.await
|
||||
storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index parts for tenant {tenant_id}"))?;
|
||||
.with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;
|
||||
|
||||
let mut remote_timelines = Vec::with_capacity(index_parts.len());
|
||||
for (timeline_id, index_part) in index_parts {
|
||||
let timeline_path = state.conf.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.with_context(|| {
|
||||
format!("Failed to convert index part into remote timeline for timeline {tenant_id}/{timeline_id}")
|
||||
})?;
|
||||
remote_timelines.push((timeline_id, remote_timeline));
|
||||
}
|
||||
Ok(Some(remote_timelines))
|
||||
let timeline_path = state
|
||||
.conf
|
||||
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.map(Some)
|
||||
.with_context(|| {
|
||||
format!("Failed to convert index part into remote timeline for timeline {sync_id}")
|
||||
})
|
||||
}
|
||||
|
||||
async fn timeline_delete_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let state = get_state(&request);
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
|
||||
tenant_mgr::delete_timeline(tenant_id, timeline_id)
|
||||
let _enter =
|
||||
info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
|
||||
.entered();
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_timeline_entry(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_detach_handler", tenant = %tenant_id).entered();
|
||||
tenant_mgr::detach_tenant(conf, tenant_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let mut remote_index = state.remote_index.write().await;
|
||||
remote_index.remove_tenant_entry(&tenant_id);
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
@@ -400,13 +365,9 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
// clone to avoid holding the lock while awaiting for blocking task
|
||||
let remote_index = state.remote_index.read().await.clone();
|
||||
|
||||
let response_data = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_list").entered();
|
||||
crate::tenant_mgr::list_tenants(&remote_index)
|
||||
crate::tenant_mgr::list_tenants()
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
@@ -414,34 +375,6 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
|
||||
async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
// if the tenant is in the process of downloading, it can be absent from the global tenant map
|
||||
let tenant_state = tokio::task::spawn_blocking(move || tenant_mgr::get_tenant_state(tenant_id))
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
|
||||
let index_accessor = remote_index.read().await;
|
||||
let has_in_progress_downloads = index_accessor
|
||||
.tenant_entry(&tenant_id)
|
||||
.ok_or_else(|| ApiError::NotFound("Tenant not found in remote index".to_string()))?
|
||||
.has_in_progress_downloads();
|
||||
|
||||
json_response(
|
||||
StatusCode::OK,
|
||||
TenantInfo {
|
||||
id: tenant_id,
|
||||
state: tenant_state,
|
||||
has_in_progress_downloads: Some(has_in_progress_downloads),
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
// check for management permission
|
||||
check_permission(&request, None)?;
|
||||
@@ -587,28 +520,24 @@ pub fn make_router(
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.get("/v1/tenant/:tenant_id", tenant_status)
|
||||
.put("/v1/tenant/config", tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.post("/v1/tenant/:tenant_id/attach", tenant_attach_handler)
|
||||
.post("/v1/tenant/:tenant_id/detach", tenant_detach_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.delete(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_delete_handler,
|
||||
)
|
||||
// for backward compatibility
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
|
||||
timeline_delete_handler,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
|
||||
wal_receiver_get_handler,
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
|
||||
timeline_attach_handler,
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
|
||||
timeline_detach_handler,
|
||||
)
|
||||
.any(handler_404))
|
||||
}
|
||||
|
||||
@@ -57,7 +57,6 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
||||
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
|
||||
pg_control = Some(control_file);
|
||||
}
|
||||
modification.flush()?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +317,6 @@ pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
}
|
||||
modification.flush()?;
|
||||
}
|
||||
tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
@@ -518,23 +516,10 @@ pub fn import_file<R: Repository, Reader: Read>(
|
||||
// Parse zenith signal file to set correct previous LSN
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
// zenith.signal format is "PREV LSN: prev_lsn"
|
||||
// TODO write serialization and deserialization in the same place.
|
||||
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
let prev_lsn = match zenith_signal {
|
||||
"PREV LSN: none" => Lsn(0),
|
||||
"PREV LSN: invalid" => Lsn(0),
|
||||
other => {
|
||||
let split = other.split(':').collect::<Vec<_>>();
|
||||
split[1]
|
||||
.trim()
|
||||
.parse::<Lsn>()
|
||||
.context("can't parse zenith.signal")?
|
||||
}
|
||||
};
|
||||
let zenith_signal = std::str::from_utf8(&bytes)?;
|
||||
let zenith_signal = zenith_signal.split(':').collect::<Vec<_>>();
|
||||
let prev_lsn = zenith_signal[1].trim().parse::<Lsn>()?;
|
||||
|
||||
// zenith.signal is not necessarily the last file, that we handle
|
||||
// but it is ok to call `finish_write()`, because final `modification.commit()`
|
||||
// will update lsn once more to the final one.
|
||||
let writer = modification.tline.tline.writer();
|
||||
writer.finish_write(prev_lsn);
|
||||
|
||||
|
||||
@@ -34,11 +34,13 @@ use std::time::{Duration, Instant, SystemTime};
|
||||
|
||||
use self::metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
use crate::repository::{GcResult, Repository, RepositoryTimeline, Timeline, TimelineWriter};
|
||||
use crate::repository::{
|
||||
GcResult, Repository, RepositoryTimeline, Timeline, TimelineSyncStatusUpdate, TimelineWriter,
|
||||
};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant_mgr;
|
||||
use crate::thread_mgr;
|
||||
@@ -156,18 +158,6 @@ pub struct LayeredRepository {
|
||||
// Global pageserver config parameters
|
||||
pub conf: &'static PageServerConf,
|
||||
|
||||
// Allows us to gracefully cancel operations that edit the directory
|
||||
// that backs this layered repository. Usage:
|
||||
//
|
||||
// Use `let _guard = file_lock.try_read()` while writing any files.
|
||||
// Use `let _guard = file_lock.write().unwrap()` to wait for all writes to finish.
|
||||
//
|
||||
// TODO try_read this lock during checkpoint as well to prevent race
|
||||
// between checkpoint and detach/delete.
|
||||
// TODO try_read this lock for all gc/compaction operations, not just
|
||||
// ones scheduled by the tenant task manager.
|
||||
pub file_lock: RwLock<()>,
|
||||
|
||||
// Overridden tenant-specific config parameters.
|
||||
// We keep the TenantConfOpt struct here to preserve the information
|
||||
// about parameters that are not set.
|
||||
@@ -230,32 +220,23 @@ impl Repository for LayeredRepository {
|
||||
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
timelineid: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<LayeredTimeline>> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
let vacant_timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(_) => bail!("Timeline already exists"),
|
||||
Entry::Vacant(vacant_entry) => vacant_entry,
|
||||
};
|
||||
|
||||
let timeline_path = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
if timeline_path.exists() {
|
||||
bail!("Timeline directory already exists, but timeline is missing in repository map. This is a bug.")
|
||||
}
|
||||
|
||||
// Create the timeline directory, and write initial metadata to file.
|
||||
crashsafe_dir::create_dir_all(timeline_path)?;
|
||||
crashsafe_dir::create_dir_all(self.conf.timeline_path(&timelineid, &self.tenant_id))?;
|
||||
|
||||
let metadata = TimelineMetadata::new(Lsn(0), None, None, Lsn(0), initdb_lsn, initdb_lsn);
|
||||
Self::save_metadata(self.conf, timeline_id, self.tenant_id, &metadata, true)?;
|
||||
Self::save_metadata(self.conf, timelineid, self.tenant_id, &metadata, true)?;
|
||||
|
||||
let timeline = LayeredTimeline::new(
|
||||
self.conf,
|
||||
Arc::clone(&self.tenant_conf),
|
||||
metadata,
|
||||
None,
|
||||
timeline_id,
|
||||
timelineid,
|
||||
self.tenant_id,
|
||||
Arc::clone(&self.walredo_mgr),
|
||||
self.upload_layers,
|
||||
@@ -264,18 +245,18 @@ impl Repository for LayeredRepository {
|
||||
|
||||
// Insert if not exists
|
||||
let timeline = Arc::new(timeline);
|
||||
vacant_timeline_entry.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)));
|
||||
match timelines.entry(timelineid) {
|
||||
Entry::Occupied(_) => bail!("Timeline already exists"),
|
||||
Entry::Vacant(vacant) => {
|
||||
vacant.insert(LayeredTimelineEntry::Loaded(Arc::clone(&timeline)))
|
||||
}
|
||||
};
|
||||
|
||||
Ok(timeline)
|
||||
}
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(
|
||||
&self,
|
||||
src: ZTimelineId,
|
||||
dst: ZTimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
) -> Result<()> {
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()> {
|
||||
// We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn
|
||||
// about timelines, so otherwise a race condition is possible, where we create new timeline and GC
|
||||
// concurrently removes data that is needed by the new timeline.
|
||||
@@ -288,14 +269,6 @@ impl Repository for LayeredRepository {
|
||||
.context("failed to load timeline for branching")?
|
||||
.ok_or_else(|| anyhow::anyhow!("unknown timeline id: {}", &src))?;
|
||||
let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
|
||||
|
||||
// If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
|
||||
let start_lsn = start_lsn.unwrap_or_else(|| {
|
||||
let lsn = src_timeline.get_last_record_lsn();
|
||||
info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}");
|
||||
lsn
|
||||
});
|
||||
|
||||
src_timeline
|
||||
.check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
|
||||
.context("invalid branch start lsn")?;
|
||||
@@ -342,19 +315,19 @@ impl Repository for LayeredRepository {
|
||||
/// metrics collection.
|
||||
fn gc_iteration(
|
||||
&self,
|
||||
target_timeline_id: Option<ZTimelineId>,
|
||||
target_timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
let timeline_str = target_timeline_id
|
||||
let timeline_str = target_timelineid
|
||||
.map(|x| x.to_string())
|
||||
.unwrap_or_else(|| "-".to_string());
|
||||
|
||||
STORAGE_TIME
|
||||
.with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str])
|
||||
.observe_closure_duration(|| {
|
||||
self.gc_iteration_internal(target_timeline_id, horizon, pitr, checkpoint_before_gc)
|
||||
self.gc_iteration_internal(target_timelineid, horizon, pitr, checkpoint_before_gc)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -364,12 +337,16 @@ impl Repository for LayeredRepository {
|
||||
// compactions. We don't want to block everything else while the
|
||||
// compaction runs.
|
||||
let timelines = self.timelines.lock().unwrap();
|
||||
let timelines_to_compact = timelines
|
||||
let mut timelines_to_compact = timelines
|
||||
.iter()
|
||||
.map(|(timelineid, timeline)| (*timelineid, timeline.clone()))
|
||||
.collect::<Vec<_>>();
|
||||
drop(timelines);
|
||||
|
||||
// Sort to prevent deadlock
|
||||
timelines_to_compact.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
|
||||
// Compact all timelines in order
|
||||
for (timelineid, timeline) in &timelines_to_compact {
|
||||
let _entered =
|
||||
info_span!("compact", timeline = %timelineid, tenant = %self.tenant_id).entered();
|
||||
@@ -421,60 +398,50 @@ impl Repository for LayeredRepository {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
// in order to be retriable detach needs to be idempotent
|
||||
// (or at least to a point that each time the detach is called it can make progress)
|
||||
fn detach_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children_exist = timelines
|
||||
// check no child timelines, because detach will remove files, which will brake child branches
|
||||
// FIXME this can still be violated because we do not guarantee
|
||||
// that all ancestors are downloaded/attached to the same pageserver
|
||||
let num_children = timelines
|
||||
.iter()
|
||||
.any(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id));
|
||||
.filter(|(_, entry)| entry.ancestor_timeline_id() == Some(timeline_id))
|
||||
.count();
|
||||
|
||||
ensure!(
|
||||
!children_exist,
|
||||
num_children == 0,
|
||||
"Cannot detach timeline which has child timelines"
|
||||
);
|
||||
let timeline_entry = match timelines.entry(timeline_id) {
|
||||
Entry::Occupied(e) => e,
|
||||
Entry::Vacant(_) => bail!("timeline not found"),
|
||||
};
|
||||
|
||||
// try to acquire gc and compaction locks to prevent errors from missing files
|
||||
let _gc_guard = self
|
||||
.gc_cs
|
||||
.try_lock()
|
||||
.map_err(|e| anyhow::anyhow!("cannot acquire gc lock {e}"))?;
|
||||
|
||||
let compaction_guard = timeline_entry.get().compaction_guard()?;
|
||||
|
||||
let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
|
||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local timeline directory '{}'",
|
||||
local_timeline_directory.display()
|
||||
)
|
||||
})?;
|
||||
info!("detach removed files");
|
||||
|
||||
drop(compaction_guard);
|
||||
timeline_entry.remove();
|
||||
|
||||
ensure!(
|
||||
timelines.remove(&timeline_id).is_some(),
|
||||
"Cannot detach timeline {timeline_id} that is not available locally"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()> {
|
||||
debug!("attach timeline_id: {}", timeline_id,);
|
||||
match self.timelines.lock().unwrap().entry(timeline_id) {
|
||||
Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
|
||||
Entry::Vacant(entry) => {
|
||||
// we need to get metadata of a timeline, another option is to pass it along with Downloaded status
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
|
||||
// finally we make newly downloaded timeline visible to repository
|
||||
entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
|
||||
},
|
||||
};
|
||||
fn apply_timeline_remote_sync_status_update(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
timeline_sync_status_update: TimelineSyncStatusUpdate,
|
||||
) -> Result<()> {
|
||||
debug!(
|
||||
"apply_timeline_remote_sync_status_update timeline_id: {} update: {:?}",
|
||||
timeline_id, timeline_sync_status_update
|
||||
);
|
||||
match timeline_sync_status_update {
|
||||
TimelineSyncStatusUpdate::Downloaded => {
|
||||
match self.timelines.lock().unwrap().entry(timeline_id) {
|
||||
Entry::Occupied(_) => bail!("We completed a download for a timeline that already exists in repository. This is a bug."),
|
||||
Entry::Vacant(entry) => {
|
||||
// we need to get metadata of a timeline, another option is to pass it along with Downloaded status
|
||||
let metadata = load_metadata(self.conf, timeline_id, self.tenant_id).context("failed to load local metadata")?;
|
||||
// finally we make newly downloaded timeline visible to repository
|
||||
entry.insert(LayeredTimelineEntry::Unloaded { id: timeline_id, metadata, })
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -524,18 +491,6 @@ impl LayeredTimelineEntry {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn compaction_guard(&self) -> Result<Option<MutexGuard<()>>, anyhow::Error> {
|
||||
match self {
|
||||
LayeredTimelineEntry::Loaded(timeline) => timeline
|
||||
.compaction_cs
|
||||
.try_lock()
|
||||
.map_err(|e| anyhow::anyhow!("cannot lock compaction critical section {e}"))
|
||||
.map(Some),
|
||||
|
||||
LayeredTimelineEntry::Unloaded { .. } => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<LayeredTimelineEntry> for RepositoryTimeline<LayeredTimeline> {
|
||||
@@ -734,7 +689,6 @@ impl LayeredRepository {
|
||||
) -> LayeredRepository {
|
||||
LayeredRepository {
|
||||
tenant_id,
|
||||
file_lock: RwLock::new(()),
|
||||
conf,
|
||||
tenant_conf: Arc::new(RwLock::new(tenant_conf)),
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
@@ -872,13 +826,13 @@ impl LayeredRepository {
|
||||
// we do.
|
||||
fn gc_iteration_internal(
|
||||
&self,
|
||||
target_timeline_id: Option<ZTimelineId>,
|
||||
target_timelineid: Option<ZTimelineId>,
|
||||
horizon: u64,
|
||||
pitr: Duration,
|
||||
checkpoint_before_gc: bool,
|
||||
) -> Result<GcResult> {
|
||||
let _span_guard =
|
||||
info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timeline_id)
|
||||
info_span!("gc iteration", tenant = %self.tenant_id, timeline = ?target_timelineid)
|
||||
.entered();
|
||||
let mut totals: GcResult = Default::default();
|
||||
let now = Instant::now();
|
||||
@@ -892,12 +846,6 @@ impl LayeredRepository {
|
||||
let mut timeline_ids = Vec::new();
|
||||
let mut timelines = self.timelines.lock().unwrap();
|
||||
|
||||
if let Some(target_timeline_id) = target_timeline_id.as_ref() {
|
||||
if timelines.get(target_timeline_id).is_none() {
|
||||
bail!("gc target timeline does not exist")
|
||||
}
|
||||
};
|
||||
|
||||
for (timeline_id, timeline_entry) in timelines.iter() {
|
||||
timeline_ids.push(*timeline_id);
|
||||
|
||||
@@ -906,7 +854,7 @@ impl LayeredRepository {
|
||||
// Somewhat related: https://github.com/zenithdb/zenith/issues/999
|
||||
if let Some(ancestor_timeline_id) = &timeline_entry.ancestor_timeline_id() {
|
||||
// If target_timeline is specified, we only need to know branchpoints of its children
|
||||
if let Some(timelineid) = target_timeline_id {
|
||||
if let Some(timelineid) = target_timelineid {
|
||||
if ancestor_timeline_id == &timelineid {
|
||||
all_branchpoints
|
||||
.insert((*ancestor_timeline_id, timeline_entry.ancestor_lsn()));
|
||||
@@ -921,7 +869,7 @@ impl LayeredRepository {
|
||||
|
||||
// Ok, we now know all the branch points.
|
||||
// Perform GC for each timeline.
|
||||
for timeline_id in timeline_ids.into_iter() {
|
||||
for timelineid in timeline_ids.into_iter() {
|
||||
if thread_mgr::is_shutdown_requested() {
|
||||
// We were requested to shut down. Stop and return with the progress we
|
||||
// made.
|
||||
@@ -930,12 +878,12 @@ impl LayeredRepository {
|
||||
|
||||
// Timeline is known to be local and loaded.
|
||||
let timeline = self
|
||||
.get_timeline_load_internal(timeline_id, &mut *timelines)?
|
||||
.get_timeline_load_internal(timelineid, &mut *timelines)?
|
||||
.expect("checked above that timeline is local and loaded");
|
||||
|
||||
// If target_timeline is specified, only GC it
|
||||
if let Some(target_timelineid) = target_timeline_id {
|
||||
if timeline_id != target_timelineid {
|
||||
if let Some(target_timelineid) = target_timelineid {
|
||||
if timelineid != target_timelineid {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -944,8 +892,8 @@ impl LayeredRepository {
|
||||
drop(timelines);
|
||||
let branchpoints: Vec<Lsn> = all_branchpoints
|
||||
.range((
|
||||
Included((timeline_id, Lsn(0))),
|
||||
Included((timeline_id, Lsn(u64::MAX))),
|
||||
Included((timelineid, Lsn(0))),
|
||||
Included((timelineid, Lsn(u64::MAX))),
|
||||
))
|
||||
.map(|&x| x.1)
|
||||
.collect();
|
||||
@@ -955,7 +903,7 @@ impl LayeredRepository {
|
||||
// used in tests, so we want as deterministic results as possible.
|
||||
if checkpoint_before_gc {
|
||||
timeline.checkpoint(CheckpointConfig::Forced)?;
|
||||
info!("timeline {} checkpoint_before_gc done", timeline_id);
|
||||
info!("timeline {} checkpoint_before_gc done", timelineid);
|
||||
}
|
||||
timeline.update_gc_info(branchpoints, cutoff, pitr);
|
||||
let result = timeline.gc()?;
|
||||
@@ -1640,7 +1588,7 @@ impl LayeredTimeline {
|
||||
Ok(layer)
|
||||
}
|
||||
|
||||
fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
//info!("PUT: key {} at {}", key, lsn);
|
||||
let layer = self.get_layer_for_write(lsn)?;
|
||||
layer.put_value(key, lsn, val)?;
|
||||
@@ -1768,29 +1716,24 @@ impl LayeredTimeline {
|
||||
|
||||
/// Flush one frozen in-memory layer to disk, as a new delta layer.
|
||||
fn flush_frozen_layer(&self, frozen_layer: Arc<InMemoryLayer>) -> Result<()> {
|
||||
let layer_paths_to_upload;
|
||||
|
||||
// As a special case, when we have just imported an image into the repository,
|
||||
// instead of writing out a L0 delta layer, we directly write out image layer
|
||||
// files instead. This is possible as long as *all* the data imported into the
|
||||
// repository have the same LSN.
|
||||
let lsn_range = frozen_layer.get_lsn_range();
|
||||
if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) {
|
||||
let pgdir = tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)?;
|
||||
let (partitioning, _lsn) =
|
||||
pgdir.repartition(self.initdb_lsn, self.get_compaction_target_size())?;
|
||||
layer_paths_to_upload =
|
||||
self.create_image_layers(&partitioning, self.initdb_lsn, true)?;
|
||||
} else {
|
||||
// normal case, write out a L0 delta layer file.
|
||||
let delta_path = self.create_delta_layer(&frozen_layer)?;
|
||||
layer_paths_to_upload = HashSet::from([delta_path]);
|
||||
}
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
let new_delta_path = new_delta.path();
|
||||
|
||||
// Sync the new layer to disk.
|
||||
//
|
||||
// We must also fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
//
|
||||
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
|
||||
// files to flush, it might be better to first write them all, and then fsync
|
||||
// them all in parallel.
|
||||
par_fsync::par_fsync(&[
|
||||
new_delta_path.clone(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
])?;
|
||||
fail_point!("flush-frozen-before-sync");
|
||||
|
||||
// The new on-disk layers are now in the layer map. We can remove the
|
||||
// in-memory layer from the map now.
|
||||
// Finally, replace the frozen in-memory layer with the new on-disk layer
|
||||
{
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
let l = layers.frozen_layers.pop_front();
|
||||
@@ -1800,27 +1743,19 @@ impl LayeredTimeline {
|
||||
// layer to disk at the same time, that would not work.
|
||||
assert!(Arc::ptr_eq(&l.unwrap(), &frozen_layer));
|
||||
|
||||
// Add the new delta layer to the LayerMap
|
||||
layers.insert_historic(Arc::new(new_delta));
|
||||
|
||||
// release lock on 'layers'
|
||||
}
|
||||
|
||||
fail_point!("checkpoint-after-sync");
|
||||
|
||||
// Update the metadata file, with new 'disk_consistent_lsn'
|
||||
//
|
||||
// TODO: This perhaps should be done in 'flush_frozen_layers', after flushing
|
||||
// *all* the layers, to avoid fsyncing the file multiple times.
|
||||
let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
|
||||
self.update_disk_consistent_lsn(disk_consistent_lsn, layer_paths_to_upload)?;
|
||||
let disk_consistent_lsn = Lsn(frozen_layer.get_lsn_range().end.0 - 1);
|
||||
fail_point!("checkpoint-after-sync");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Update metadata file
|
||||
fn update_disk_consistent_lsn(
|
||||
&self,
|
||||
disk_consistent_lsn: Lsn,
|
||||
layer_paths_to_upload: HashSet<PathBuf>,
|
||||
) -> Result<()> {
|
||||
// If we were able to advance 'disk_consistent_lsn', save it the metadata file.
|
||||
// After crash, we will restart WAL streaming and processing from that point.
|
||||
let old_disk_consistent_lsn = self.disk_consistent_lsn.load();
|
||||
@@ -1870,11 +1805,14 @@ impl LayeredTimeline {
|
||||
false,
|
||||
)?;
|
||||
|
||||
NUM_PERSISTENT_FILES_CREATED.inc_by(1);
|
||||
PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());
|
||||
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
layer_paths_to_upload,
|
||||
HashSet::from([new_delta_path]),
|
||||
Some(metadata),
|
||||
);
|
||||
}
|
||||
@@ -1886,37 +1824,6 @@ impl LayeredTimeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write out the given frozen in-memory layer as a new L0 delta file
|
||||
fn create_delta_layer(&self, frozen_layer: &InMemoryLayer) -> Result<PathBuf> {
|
||||
// Write it out
|
||||
let new_delta = frozen_layer.write_to_disk()?;
|
||||
let new_delta_path = new_delta.path();
|
||||
|
||||
// Sync it to disk.
|
||||
//
|
||||
// We must also fsync the timeline dir to ensure the directory entries for
|
||||
// new layer files are durable
|
||||
//
|
||||
// TODO: If we're running inside 'flush_frozen_layers' and there are multiple
|
||||
// files to flush, it might be better to first write them all, and then fsync
|
||||
// them all in parallel.
|
||||
par_fsync::par_fsync(&[
|
||||
new_delta_path.clone(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
])?;
|
||||
|
||||
// Add it to the layer map
|
||||
{
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
layers.insert_historic(Arc::new(new_delta));
|
||||
}
|
||||
|
||||
NUM_PERSISTENT_FILES_CREATED.inc_by(1);
|
||||
PERSISTENT_BYTES_WRITTEN.inc_by(new_delta_path.metadata()?.len());
|
||||
|
||||
Ok(new_delta_path)
|
||||
}
|
||||
|
||||
pub fn compact(&self) -> Result<()> {
|
||||
//
|
||||
// High level strategy for compaction / image creation:
|
||||
@@ -1960,23 +1867,29 @@ impl LayeredTimeline {
|
||||
if let Ok(pgdir) =
|
||||
tenant_mgr::get_local_timeline_with_load(self.tenant_id, self.timeline_id)
|
||||
{
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let (partitioning, lsn) = pgdir.repartition(
|
||||
self.get_last_record_lsn(),
|
||||
self.get_compaction_target_size(),
|
||||
)?;
|
||||
let layer_paths_to_upload = self.create_image_layers(&partitioning, lsn, false)?;
|
||||
if !layer_paths_to_upload.is_empty()
|
||||
&& self.upload_layers.load(atomic::Ordering::Relaxed)
|
||||
{
|
||||
let timer = self.create_images_time_histo.start_timer();
|
||||
// 2. Create new image layers for partitions that have been modified
|
||||
// "enough".
|
||||
let mut layer_paths_to_upload = HashSet::with_capacity(partitioning.parts.len());
|
||||
for part in partitioning.parts.iter() {
|
||||
if self.time_for_new_image_layer(part, lsn)? {
|
||||
let new_path = self.create_image_layer(part, lsn)?;
|
||||
layer_paths_to_upload.insert(new_path);
|
||||
}
|
||||
}
|
||||
if self.upload_layers.load(atomic::Ordering::Relaxed) {
|
||||
storage_sync::schedule_layer_upload(
|
||||
self.tenant_id,
|
||||
self.timeline_id,
|
||||
HashSet::from_iter(layer_paths_to_upload),
|
||||
layer_paths_to_upload,
|
||||
None,
|
||||
);
|
||||
}
|
||||
timer.stop_and_record();
|
||||
|
||||
// 3. Compact
|
||||
let timer = self.compact_time_histo.start_timer();
|
||||
@@ -2001,28 +1914,15 @@ impl LayeredTimeline {
|
||||
} else {
|
||||
Lsn(0)
|
||||
};
|
||||
// Let's consider an example:
|
||||
//
|
||||
// delta layer with LSN range 71-81
|
||||
// delta layer with LSN range 81-91
|
||||
// delta layer with LSN range 91-101
|
||||
// image layer at LSN 100
|
||||
//
|
||||
// If 'lsn' is still 100, i.e. no new WAL has been processed since the last image layer,
|
||||
// there's no need to create a new one. We check this case explicitly, to avoid passing
|
||||
// a bogus range to count_deltas below, with start > end. It's even possible that there
|
||||
// are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
|
||||
// after we read last_record_lsn, which is passed here in the 'lsn' argument.
|
||||
if img_lsn < lsn {
|
||||
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
||||
|
||||
debug!(
|
||||
"key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
|
||||
img_range.start, img_range.end, num_deltas, img_lsn, lsn
|
||||
);
|
||||
if num_deltas >= self.get_image_creation_threshold() {
|
||||
return Ok(true);
|
||||
}
|
||||
let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?;
|
||||
|
||||
debug!(
|
||||
"range {}-{}, has {} deltas on this timeline",
|
||||
img_range.start, img_range.end, num_deltas
|
||||
);
|
||||
if num_deltas >= self.get_image_creation_threshold() {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2030,40 +1930,21 @@ impl LayeredTimeline {
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
fn create_image_layers(
|
||||
&self,
|
||||
partitioning: &KeyPartitioning,
|
||||
lsn: Lsn,
|
||||
force: bool,
|
||||
) -> Result<HashSet<PathBuf>> {
|
||||
let timer = self.create_images_time_histo.start_timer();
|
||||
let mut image_layers: Vec<ImageLayer> = Vec::new();
|
||||
let mut layer_paths_to_upload = HashSet::new();
|
||||
for partition in partitioning.parts.iter() {
|
||||
if force || self.time_for_new_image_layer(partition, lsn)? {
|
||||
let img_range =
|
||||
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
|
||||
let mut image_layer_writer = ImageLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_id,
|
||||
&img_range,
|
||||
lsn,
|
||||
)?;
|
||||
fn create_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<PathBuf> {
|
||||
let img_range =
|
||||
partition.ranges.first().unwrap().start..partition.ranges.last().unwrap().end;
|
||||
let mut image_layer_writer =
|
||||
ImageLayerWriter::new(self.conf, self.timeline_id, self.tenant_id, &img_range, lsn)?;
|
||||
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = self.get(key, lsn)?;
|
||||
image_layer_writer.put_image(key, &img)?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
layer_paths_to_upload.insert(image_layer.path());
|
||||
image_layers.push(image_layer);
|
||||
for range in &partition.ranges {
|
||||
let mut key = range.start;
|
||||
while key < range.end {
|
||||
let img = self.get(key, lsn)?;
|
||||
image_layer_writer.put_image(key, &img)?;
|
||||
key = key.next();
|
||||
}
|
||||
}
|
||||
let image_layer = image_layer_writer.finish()?;
|
||||
|
||||
// Sync the new layer to disk before adding it to the layer map, to make sure
|
||||
// we don't garbage collect something based on the new layer, before it has
|
||||
@@ -2074,18 +1955,19 @@ impl LayeredTimeline {
|
||||
//
|
||||
// Compaction creates multiple image layers. It would be better to create them all
|
||||
// and fsync them all in parallel.
|
||||
let mut all_paths = Vec::from_iter(layer_paths_to_upload.clone());
|
||||
all_paths.push(self.conf.timeline_path(&self.timeline_id, &self.tenant_id));
|
||||
par_fsync::par_fsync(&all_paths)?;
|
||||
par_fsync::par_fsync(&[
|
||||
image_layer.path(),
|
||||
self.conf.timeline_path(&self.timeline_id, &self.tenant_id),
|
||||
])?;
|
||||
|
||||
// FIXME: Do we need to do something to upload it to remote storage here?
|
||||
|
||||
let mut layers = self.layers.write().unwrap();
|
||||
for l in image_layers {
|
||||
layers.insert_historic(Arc::new(l));
|
||||
}
|
||||
let new_path = image_layer.path();
|
||||
layers.insert_historic(Arc::new(image_layer));
|
||||
drop(layers);
|
||||
timer.stop_and_record();
|
||||
|
||||
Ok(layer_paths_to_upload)
|
||||
Ok(new_path)
|
||||
}
|
||||
|
||||
///
|
||||
@@ -2332,9 +2214,6 @@ impl LayeredTimeline {
|
||||
LsnForTimestamp::Past(lsn) => {
|
||||
debug!("past({})", lsn);
|
||||
}
|
||||
LsnForTimestamp::NoData(lsn) => {
|
||||
debug!("nodata({})", lsn);
|
||||
}
|
||||
}
|
||||
debug!("pitr_cutoff_lsn = {:?}", pitr_cutoff_lsn)
|
||||
}
|
||||
@@ -2608,7 +2487,7 @@ impl Deref for LayeredTimelineWriter<'_> {
|
||||
}
|
||||
|
||||
impl<'a> TimelineWriter<'_> for LayeredTimelineWriter<'a> {
|
||||
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()> {
|
||||
fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()> {
|
||||
self.tl.put_value(key, lsn, value)
|
||||
}
|
||||
|
||||
@@ -2750,7 +2629,7 @@ pub mod tests {
|
||||
let TEST_KEY: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.put(TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
@@ -2758,7 +2637,7 @@ pub mod tests {
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.put(TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
@@ -2766,7 +2645,7 @@ pub mod tests {
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?;
|
||||
writer.put(TEST_KEY, Lsn(0x30), Value::Image(TEST_IMG("foo at 0x30")))?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
drop(writer);
|
||||
|
||||
@@ -2774,7 +2653,7 @@ pub mod tests {
|
||||
tline.compact()?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?;
|
||||
writer.put(TEST_KEY, Lsn(0x40), Value::Image(TEST_IMG("foo at 0x40")))?;
|
||||
writer.finish_write(Lsn(0x40));
|
||||
drop(writer);
|
||||
|
||||
@@ -2812,7 +2691,7 @@ pub mod tests {
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
@@ -2858,7 +2737,7 @@ pub mod tests {
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
updated[blknum] = lsn;
|
||||
@@ -2876,7 +2755,7 @@ pub mod tests {
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
drop(writer);
|
||||
@@ -2928,7 +2807,7 @@ pub mod tests {
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
updated[blknum] = lsn;
|
||||
@@ -2940,7 +2819,7 @@ pub mod tests {
|
||||
let mut tline_id = TIMELINE_ID;
|
||||
for _ in 0..50 {
|
||||
let new_tline_id = ZTimelineId::generate();
|
||||
repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
|
||||
repo.branch_timeline(tline_id, new_tline_id, lsn)?;
|
||||
tline = repo.get_timeline_load(new_tline_id)?;
|
||||
tline_id = new_tline_id;
|
||||
|
||||
@@ -2952,7 +2831,7 @@ pub mod tests {
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
Value::Image(TEST_IMG(&format!("{} at {}", blknum, lsn))),
|
||||
)?;
|
||||
println!("updating {} at {}", blknum, lsn);
|
||||
writer.finish_write(lsn);
|
||||
@@ -2999,7 +2878,7 @@ pub mod tests {
|
||||
#[allow(clippy::needless_range_loop)]
|
||||
for idx in 0..NUM_TLINES {
|
||||
let new_tline_id = ZTimelineId::generate();
|
||||
repo.branch_timeline(tline_id, new_tline_id, Some(lsn))?;
|
||||
repo.branch_timeline(tline_id, new_tline_id, lsn)?;
|
||||
tline = repo.get_timeline_load(new_tline_id)?;
|
||||
tline_id = new_tline_id;
|
||||
|
||||
@@ -3011,7 +2890,7 @@ pub mod tests {
|
||||
writer.put(
|
||||
test_key,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
Value::Image(TEST_IMG(&format!("{} {} at {}", idx, blknum, lsn))),
|
||||
)?;
|
||||
println!("updating [{}][{}] at {}", idx, blknum, lsn);
|
||||
writer.finish_write(lsn);
|
||||
|
||||
@@ -34,7 +34,7 @@ pub trait BlobCursor {
|
||||
) -> Result<(), std::io::Error>;
|
||||
}
|
||||
|
||||
impl<R> BlobCursor for BlockCursor<R>
|
||||
impl<'a, R> BlobCursor for BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
|
||||
@@ -445,10 +445,7 @@ impl ImageLayerWriter {
|
||||
},
|
||||
);
|
||||
info!("new image layer {}", path.display());
|
||||
let mut file = VirtualFile::open_with_options(
|
||||
&path,
|
||||
std::fs::OpenOptions::new().write(true).create_new(true),
|
||||
)?;
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
let blob_writer = WriteBlobWriter::new(file, PAGE_SZ as u64);
|
||||
|
||||
@@ -267,13 +267,13 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.assert_writeable();
|
||||
|
||||
let off = inner.file.write_blob(&Value::ser(val)?)?;
|
||||
let off = inner.file.write_blob(&Value::ser(&val)?)?;
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
|
||||
@@ -13,7 +13,7 @@ pub mod repository;
|
||||
pub mod storage_sync;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_tasks;
|
||||
pub mod tenant_threads;
|
||||
pub mod thread_mgr;
|
||||
pub mod timelines;
|
||||
pub mod virtual_file;
|
||||
|
||||
@@ -554,7 +554,7 @@ impl PageServerHandler {
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
|
||||
let timeline = repo.create_empty_timeline(timeline_id, Lsn(0))?;
|
||||
let repartition_distance = repo.get_checkpoint_distance();
|
||||
let mut datadir_timeline =
|
||||
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
|
||||
@@ -772,7 +772,6 @@ impl PageServerHandler {
|
||||
pgb: &mut PostgresBackend,
|
||||
timelineid: ZTimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
tenantid: ZTenantId,
|
||||
full_backup: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -797,8 +796,7 @@ impl PageServerHandler {
|
||||
{
|
||||
let mut writer = CopyDataSink { pgb };
|
||||
|
||||
let basebackup =
|
||||
basebackup::Basebackup::new(&mut writer, &timeline, lsn, prev_lsn, full_backup)?;
|
||||
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn, full_backup)?;
|
||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()?;
|
||||
}
|
||||
@@ -901,67 +899,33 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
};
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, None, tenantid, false)?;
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, false)?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
// return pair of prev_lsn and last_lsn
|
||||
else if query_string.starts_with("get_last_record_rlsn ") {
|
||||
let (_, params_raw) = query_string.split_at("get_last_record_rlsn ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
ensure!(
|
||||
params.len() == 2,
|
||||
"invalid param number for get_last_record_rlsn command"
|
||||
);
|
||||
|
||||
let tenantid = ZTenantId::from_str(params[0])?;
|
||||
let timelineid = ZTimelineId::from_str(params[1])?;
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
let end_of_timeline = timeline.tline.get_last_record_rlsn();
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::text_col(b"prev_lsn"),
|
||||
RowDescriptor::text_col(b"last_lsn"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(end_of_timeline.prev.to_string().as_bytes()),
|
||||
Some(end_of_timeline.last.to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if query_string.starts_with("fullbackup ") {
|
||||
let (_, params_raw) = query_string.split_at("fullbackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
|
||||
ensure!(
|
||||
params.len() >= 2,
|
||||
params.len() == 3,
|
||||
"invalid param number for fullbackup command"
|
||||
);
|
||||
|
||||
let tenantid = ZTenantId::from_str(params[0])?;
|
||||
let timelineid = ZTimelineId::from_str(params[1])?;
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if params.len() > 2 {
|
||||
Some(Lsn::from_str(params[2])?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if params.len() > 3 {
|
||||
Some(Lsn::from_str(params[3])?)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.check_permission(Some(tenantid))?;
|
||||
|
||||
// Lsn is required for fullbackup, because otherwise we would not know
|
||||
// at which lsn to upload this backup.
|
||||
//
|
||||
// The caller is responsible for providing a valid lsn
|
||||
// and using it in the subsequent import.
|
||||
let lsn = Some(Lsn::from_str(params[2])?);
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, prev_lsn, tenantid, true)?;
|
||||
self.handle_basebackup_request(pgb, timelineid, lsn, tenantid, true)?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("import basebackup ") {
|
||||
// Import the `base` section (everything but the wal) of a basebackup.
|
||||
@@ -987,10 +951,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) {
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
|
||||
}
|
||||
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
|
||||
};
|
||||
} else if query_string.starts_with("import wal ") {
|
||||
// Import the `pg_wal` section of a basebackup.
|
||||
@@ -1009,10 +970,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) {
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
|
||||
}
|
||||
Err(e) => pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?,
|
||||
};
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
@@ -1193,7 +1151,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
|
||||
@@ -51,7 +51,6 @@ pub enum LsnForTimestamp {
|
||||
Present(Lsn),
|
||||
Future(Lsn),
|
||||
Past(Lsn),
|
||||
NoData(Lsn),
|
||||
}
|
||||
|
||||
impl<R: Repository> DatadirTimeline<R> {
|
||||
@@ -264,7 +263,7 @@ impl<R: Repository> DatadirTimeline<R> {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
bail!("no commit timestamps found");
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
@@ -902,57 +901,6 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Flush changes accumulated so far to the underlying repository.
|
||||
///
|
||||
/// Usually, changes made in DatadirModification are atomic, but this allows
|
||||
/// you to flush them to the underlying repository before the final `commit`.
|
||||
/// That allows to free up the memory used to hold the pending changes.
|
||||
///
|
||||
/// Currently only used during bulk import of a data directory. In that
|
||||
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
||||
/// whole import fails and the timeline will be deleted anyway.
|
||||
/// (Or to be precise, it will be left behind for debugging purposes and
|
||||
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
|
||||
///
|
||||
/// Note: A consequence of flushing the pending operations is that they
|
||||
/// won't be visible to subsequent operations until `commit`. The function
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub fn flush(&mut self) -> Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
if pending_nblocks < 10000 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let writer = self.tline.tline.writer();
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut result: Result<()> = Ok(());
|
||||
self.pending_updates.retain(|&key, value| {
|
||||
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
|
||||
result = writer.put(key, self.lsn, value);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
});
|
||||
result?;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
self.tline.current_logical_size.fetch_add(
|
||||
pending_nblocks * pg_constants::BLCKSZ as isize,
|
||||
Ordering::SeqCst,
|
||||
);
|
||||
self.pending_nblocks = 0;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish this atomic update, writing all the updated keys to the
|
||||
/// underlying timeline.
|
||||
@@ -963,7 +911,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
|
||||
for (key, value) in self.pending_updates {
|
||||
writer.put(key, self.lsn, &value)?;
|
||||
writer.put(key, self.lsn, value)?;
|
||||
}
|
||||
for key_range in self.pending_deletions {
|
||||
writer.delete(key_range.clone(), self.lsn)?;
|
||||
@@ -1368,10 +1316,6 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_rel_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
@@ -1390,12 +1334,6 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
//
|
||||
//-- Tests that should work the same with any Repository/Timeline implementation.
|
||||
//
|
||||
|
||||
@@ -81,12 +81,6 @@ mod profiling_impl {
|
||||
|
||||
pub struct DummyProfilerGuard;
|
||||
|
||||
impl Drop for DummyProfilerGuard {
|
||||
fn drop(&mut self) {
|
||||
// do nothing, this exists to calm Clippy down
|
||||
}
|
||||
}
|
||||
|
||||
pub fn profpoint_start(
|
||||
_conf: &PageServerConf,
|
||||
_point: ProfilingConfig,
|
||||
|
||||
@@ -7,6 +7,7 @@ use byteorder::{ByteOrder, BE};
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fmt;
|
||||
use std::fmt::Display;
|
||||
use std::ops::{AddAssign, Range};
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use std::time::Duration;
|
||||
@@ -181,6 +182,20 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub enum TimelineSyncStatusUpdate {
|
||||
Downloaded,
|
||||
}
|
||||
|
||||
impl Display for TimelineSyncStatusUpdate {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
TimelineSyncStatusUpdate::Downloaded => "Downloaded",
|
||||
};
|
||||
f.write_str(s)
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// A repository corresponds to one .neon directory. One repository holds multiple
|
||||
/// timelines, forked off from the same initial call to 'initdb'.
|
||||
@@ -189,7 +204,11 @@ pub trait Repository: Send + Sync {
|
||||
|
||||
/// Updates timeline based on the `TimelineSyncStatusUpdate`, received from the remote storage synchronization.
|
||||
/// See [`crate::remote_storage`] for more details about the synchronization.
|
||||
fn attach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
fn apply_timeline_remote_sync_status_update(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
timeline_sync_status_update: TimelineSyncStatusUpdate,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Get Timeline handle for given zenith timeline ID.
|
||||
/// This function is idempotent. It doesn't change internal state in any way.
|
||||
@@ -206,17 +225,12 @@ pub trait Repository: Send + Sync {
|
||||
/// Initdb lsn is provided for timeline impl to be able to perform checks for some operations against it.
|
||||
fn create_empty_timeline(
|
||||
&self,
|
||||
timeline_id: ZTimelineId,
|
||||
timelineid: ZTimelineId,
|
||||
initdb_lsn: Lsn,
|
||||
) -> Result<Arc<Self::Timeline>>;
|
||||
|
||||
/// Branch a timeline
|
||||
fn branch_timeline(
|
||||
&self,
|
||||
src: ZTimelineId,
|
||||
dst: ZTimelineId,
|
||||
start_lsn: Option<Lsn>,
|
||||
) -> Result<()>;
|
||||
fn branch_timeline(&self, src: ZTimelineId, dst: ZTimelineId, start_lsn: Lsn) -> Result<()>;
|
||||
|
||||
/// Flush all data to disk.
|
||||
///
|
||||
@@ -246,10 +260,10 @@ pub trait Repository: Send + Sync {
|
||||
/// api's 'compact' command.
|
||||
fn compaction_iteration(&self) -> Result<()>;
|
||||
|
||||
/// removes timeline-related in-memory data
|
||||
fn delete_timeline(&self, timeline_id: ZTimelineId) -> anyhow::Result<()>;
|
||||
/// detaches timeline-related in-memory data.
|
||||
fn detach_timeline(&self, timeline_id: ZTimelineId) -> Result<()>;
|
||||
|
||||
/// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
|
||||
// Allows to retrieve remote timeline index from the repo. Used in walreceiver to grab remote consistent lsn.
|
||||
fn get_remote_index(&self) -> &RemoteIndex;
|
||||
}
|
||||
|
||||
@@ -393,7 +407,7 @@ pub trait TimelineWriter<'a> {
|
||||
///
|
||||
/// This will implicitly extend the relation, if the page is beyond the
|
||||
/// current end-of-file.
|
||||
fn put(&self, key: Key, lsn: Lsn, value: &Value) -> Result<()>;
|
||||
fn put(&self, key: Key, lsn: Lsn, value: Value) -> Result<()>;
|
||||
|
||||
fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> Result<()>;
|
||||
|
||||
@@ -523,7 +537,7 @@ pub mod repo_harness {
|
||||
TenantConfOpt::from(self.tenant_conf),
|
||||
walredo_mgr,
|
||||
self.tenant_id,
|
||||
RemoteIndex::default(),
|
||||
RemoteIndex::empty(),
|
||||
false,
|
||||
);
|
||||
// populate repo with locally available timelines
|
||||
@@ -539,7 +553,10 @@ pub mod repo_harness {
|
||||
.parse()
|
||||
.unwrap();
|
||||
|
||||
repo.attach_timeline(timeline_id)?;
|
||||
repo.apply_timeline_remote_sync_status_update(
|
||||
timeline_id,
|
||||
TimelineSyncStatusUpdate::Downloaded,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(repo)
|
||||
@@ -603,12 +620,12 @@ mod tests {
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.put(*TEST_KEY, Lsn(0x10), Value::Image(TEST_IMG("foo at 0x10")))?;
|
||||
writer.finish_write(Lsn(0x10));
|
||||
drop(writer);
|
||||
|
||||
let writer = tline.writer();
|
||||
writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.put(*TEST_KEY, Lsn(0x20), Value::Image(TEST_IMG("foo at 0x20")))?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
drop(writer);
|
||||
|
||||
@@ -619,19 +636,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn no_duplicate_timelines() -> Result<()> {
|
||||
let repo = RepoHarness::create("no_duplicate_timelines")?.load();
|
||||
let _ = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
|
||||
match repo.create_empty_timeline(TIMELINE_ID, Lsn(0)) {
|
||||
Ok(_) => panic!("duplicate timeline creation should fail"),
|
||||
Err(e) => assert_eq!(e.to_string(), "Timeline already exists"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convenience function to create a page image with given string as the only content
|
||||
pub fn test_value(s: &str) -> Value {
|
||||
let mut buf = BytesMut::new();
|
||||
@@ -655,24 +659,24 @@ mod tests {
|
||||
let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();
|
||||
|
||||
// Insert a value on the timeline
|
||||
writer.put(TEST_KEY_A, Lsn(0x20), &test_value("foo at 0x20"))?;
|
||||
writer.put(TEST_KEY_B, Lsn(0x20), &test_value("foobar at 0x20"))?;
|
||||
writer.put(TEST_KEY_A, Lsn(0x20), test_value("foo at 0x20"))?;
|
||||
writer.put(TEST_KEY_B, Lsn(0x20), test_value("foobar at 0x20"))?;
|
||||
writer.finish_write(Lsn(0x20));
|
||||
|
||||
writer.put(TEST_KEY_A, Lsn(0x30), &test_value("foo at 0x30"))?;
|
||||
writer.put(TEST_KEY_A, Lsn(0x30), test_value("foo at 0x30"))?;
|
||||
writer.finish_write(Lsn(0x30));
|
||||
writer.put(TEST_KEY_A, Lsn(0x40), &test_value("foo at 0x40"))?;
|
||||
writer.put(TEST_KEY_A, Lsn(0x40), test_value("foo at 0x40"))?;
|
||||
writer.finish_write(Lsn(0x40));
|
||||
|
||||
//assert_current_logical_size(&tline, Lsn(0x40));
|
||||
|
||||
// Branch the history, modify relation differently on the new timeline
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30)))?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?;
|
||||
let newtline = repo
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
.expect("Should have a local timeline");
|
||||
let new_writer = newtline.writer();
|
||||
new_writer.put(TEST_KEY_A, Lsn(0x40), &test_value("bar at 0x40"))?;
|
||||
new_writer.put(TEST_KEY_A, Lsn(0x40), test_value("bar at 0x40"))?;
|
||||
new_writer.finish_write(Lsn(0x40));
|
||||
|
||||
// Check page contents on both branches
|
||||
@@ -703,14 +707,14 @@ mod tests {
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
@@ -721,14 +725,14 @@ mod tests {
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
lsn += 0x10;
|
||||
writer.put(
|
||||
*TEST_KEY,
|
||||
lsn,
|
||||
&Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
Value::Image(TEST_IMG(&format!("foo at {}", lsn))),
|
||||
)?;
|
||||
writer.finish_write(lsn);
|
||||
}
|
||||
@@ -749,7 +753,7 @@ mod tests {
|
||||
repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, false)?;
|
||||
|
||||
// try to branch at lsn 25, should fail because we already garbage collected the data
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(err.to_string().contains("invalid branch start lsn"));
|
||||
@@ -770,7 +774,7 @@ mod tests {
|
||||
|
||||
repo.create_empty_timeline(TIMELINE_ID, Lsn(0x50))?;
|
||||
// try to branch at lsn 0x25, should fail because initdb lsn is 0x50
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) {
|
||||
match repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x25)) {
|
||||
Ok(_) => panic!("branching should have failed"),
|
||||
Err(err) => {
|
||||
assert!(&err.to_string().contains("invalid branch start lsn"));
|
||||
@@ -815,7 +819,7 @@ mod tests {
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
.expect("Should have a local timeline");
|
||||
@@ -831,7 +835,7 @@ mod tests {
|
||||
let tline = repo.create_empty_timeline(TIMELINE_ID, Lsn(0))?;
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
let newtline = repo
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
.expect("Should have a local timeline");
|
||||
@@ -889,7 +893,7 @@ mod tests {
|
||||
make_some_layers(tline.as_ref(), Lsn(0x20))?;
|
||||
tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40)))?;
|
||||
repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x40))?;
|
||||
|
||||
let newtline = repo
|
||||
.get_timeline_load(NEW_TIMELINE_ID)
|
||||
|
||||
@@ -178,8 +178,9 @@ use crate::{
|
||||
metadata::{metadata_path, TimelineMetadata, METADATA_FILE_NAME},
|
||||
LayeredRepository,
|
||||
},
|
||||
repository::TimelineSyncStatusUpdate,
|
||||
storage_sync::{self, index::RemoteIndex},
|
||||
tenant_mgr::attach_downloaded_tenants,
|
||||
tenant_mgr::apply_timeline_sync_status_updates,
|
||||
thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
};
|
||||
@@ -190,8 +191,7 @@ use metrics::{
|
||||
};
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use self::download::download_index_parts;
|
||||
pub use self::download::gather_tenant_timelines_index_parts;
|
||||
pub use self::download::download_index_part;
|
||||
pub use self::download::TEMP_DOWNLOAD_EXTENSION;
|
||||
|
||||
lazy_static! {
|
||||
@@ -301,7 +301,7 @@ pub fn start_local_timeline_sync(
|
||||
}
|
||||
Ok(SyncStartupData {
|
||||
local_timeline_init_statuses,
|
||||
remote_index: RemoteIndex::default(),
|
||||
remote_index: RemoteIndex::empty(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -835,7 +835,7 @@ where
|
||||
.build()
|
||||
.context("Failed to create storage sync runtime")?;
|
||||
|
||||
let applicable_index_parts = runtime.block_on(download_index_parts(
|
||||
let applicable_index_parts = runtime.block_on(try_fetch_index_parts(
|
||||
conf,
|
||||
&storage,
|
||||
local_timeline_files.keys().copied().collect(),
|
||||
@@ -918,48 +918,16 @@ fn storage_sync_loop<P, S>(
|
||||
});
|
||||
|
||||
match loop_step {
|
||||
ControlFlow::Continue(updated_tenants) => {
|
||||
if updated_tenants.is_empty() {
|
||||
debug!("Sync loop step completed, no new tenant states");
|
||||
ControlFlow::Continue(new_timeline_states) => {
|
||||
if new_timeline_states.is_empty() {
|
||||
debug!("Sync loop step completed, no new timeline states");
|
||||
} else {
|
||||
info!(
|
||||
"Sync loop step completed, {} new tenant state update(s)",
|
||||
updated_tenants.len()
|
||||
"Sync loop step completed, {} new timeline state update(s)",
|
||||
new_timeline_states.len()
|
||||
);
|
||||
let mut sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>> =
|
||||
HashMap::new();
|
||||
let index_accessor = runtime.block_on(index.write());
|
||||
for tenant_id in updated_tenants {
|
||||
let tenant_entry = match index_accessor.tenant_entry(&tenant_id) {
|
||||
Some(tenant_entry) => tenant_entry,
|
||||
None => {
|
||||
error!(
|
||||
"cannot find tenant in remote index for timeline sync update"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
if tenant_entry.has_in_progress_downloads() {
|
||||
info!("Tenant {tenant_id} has pending timeline downloads, skipping repository registration");
|
||||
continue;
|
||||
} else {
|
||||
info!(
|
||||
"Tenant {tenant_id} download completed. Picking to register in repository"
|
||||
);
|
||||
// Here we assume that if tenant has no in-progress downloads that
|
||||
// means that it is the last completed timeline download that triggered
|
||||
// sync status update. So we look at the index for available timelines
|
||||
// and register them all at once in a repository for download
|
||||
// to be submitted in a single operation to repository
|
||||
// so it can apply them at once to internal timeline map.
|
||||
sync_status_updates
|
||||
.insert(tenant_id, tenant_entry.keys().copied().collect());
|
||||
}
|
||||
}
|
||||
drop(index_accessor);
|
||||
// Batch timeline download registration to ensure that the external registration code won't block any running tasks before.
|
||||
attach_downloaded_tenants(conf, &index, sync_status_updates);
|
||||
apply_timeline_sync_status_updates(conf, &index, new_timeline_states);
|
||||
}
|
||||
}
|
||||
ControlFlow::Break(()) => {
|
||||
@@ -970,14 +938,6 @@ fn storage_sync_loop<P, S>(
|
||||
}
|
||||
}
|
||||
|
||||
// needed to check whether the download happened
|
||||
// more informative than just a bool
|
||||
#[derive(Debug)]
|
||||
enum DownloadMarker {
|
||||
Downloaded,
|
||||
Nothing,
|
||||
}
|
||||
|
||||
async fn process_batches<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
max_sync_errors: NonZeroU32,
|
||||
@@ -985,7 +945,7 @@ async fn process_batches<P, S>(
|
||||
index: &RemoteIndex,
|
||||
batched_tasks: HashMap<ZTenantTimelineId, SyncTaskBatch>,
|
||||
sync_queue: &SyncQueue,
|
||||
) -> HashSet<ZTenantId>
|
||||
) -> HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
@@ -1010,19 +970,22 @@ where
|
||||
})
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
let mut downloaded_timelines = HashSet::new();
|
||||
let mut new_timeline_states: HashMap<
|
||||
ZTenantId,
|
||||
HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
|
||||
> = HashMap::new();
|
||||
|
||||
while let Some((sync_id, download_marker)) = sync_results.next().await {
|
||||
debug!(
|
||||
"Finished storage sync task for sync id {sync_id} download marker {:?}",
|
||||
download_marker
|
||||
);
|
||||
if matches!(download_marker, DownloadMarker::Downloaded) {
|
||||
downloaded_timelines.insert(sync_id.tenant_id);
|
||||
while let Some((sync_id, state_update)) = sync_results.next().await {
|
||||
debug!("Finished storage sync task for sync id {sync_id}");
|
||||
if let Some(state_update) = state_update {
|
||||
new_timeline_states
|
||||
.entry(sync_id.tenant_id)
|
||||
.or_default()
|
||||
.insert(sync_id.timeline_id, state_update);
|
||||
}
|
||||
}
|
||||
|
||||
downloaded_timelines
|
||||
new_timeline_states
|
||||
}
|
||||
|
||||
async fn process_sync_task_batch<P, S>(
|
||||
@@ -1031,7 +994,7 @@ async fn process_sync_task_batch<P, S>(
|
||||
max_sync_errors: NonZeroU32,
|
||||
sync_id: ZTenantTimelineId,
|
||||
batch: SyncTaskBatch,
|
||||
) -> DownloadMarker
|
||||
) -> Option<TimelineSyncStatusUpdate>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
@@ -1116,7 +1079,7 @@ where
|
||||
}
|
||||
}
|
||||
}
|
||||
DownloadMarker::Nothing
|
||||
None
|
||||
}
|
||||
.instrument(info_span!("download_timeline_data")),
|
||||
);
|
||||
@@ -1170,7 +1133,7 @@ async fn download_timeline_data<P, S>(
|
||||
new_download_data: SyncData<LayersDownload>,
|
||||
sync_start: Instant,
|
||||
task_name: &str,
|
||||
) -> DownloadMarker
|
||||
) -> Option<TimelineSyncStatusUpdate>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
@@ -1199,7 +1162,7 @@ where
|
||||
Ok(()) => match index.write().await.set_awaits_download(&sync_id, false) {
|
||||
Ok(()) => {
|
||||
register_sync_status(sync_id, sync_start, task_name, Some(true));
|
||||
return DownloadMarker::Downloaded;
|
||||
return Some(TimelineSyncStatusUpdate::Downloaded);
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Timeline {sync_id} was expected to be in the remote index after a successful download, but it's absent: {e:?}");
|
||||
@@ -1215,7 +1178,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
DownloadMarker::Nothing
|
||||
None
|
||||
}
|
||||
|
||||
async fn update_local_metadata(
|
||||
@@ -1495,6 +1458,35 @@ async fn validate_task_retries<T>(
|
||||
ControlFlow::Continue(sync_data)
|
||||
}
|
||||
|
||||
async fn try_fetch_index_parts<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
keys: HashSet<ZTenantTimelineId>,
|
||||
) -> HashMap<ZTenantTimelineId, IndexPart>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let mut index_parts = HashMap::with_capacity(keys.len());
|
||||
|
||||
let mut part_downloads = keys
|
||||
.into_iter()
|
||||
.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
while let Some((id, part_upload_result)) = part_downloads.next().await {
|
||||
match part_upload_result {
|
||||
Ok(index_part) => {
|
||||
debug!("Successfully fetched index part for {id}");
|
||||
index_parts.insert(id, index_part);
|
||||
}
|
||||
Err(e) => warn!("Failed to fetch index part for {id}: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
index_parts
|
||||
}
|
||||
|
||||
fn schedule_first_sync_tasks(
|
||||
index: &mut RemoteTimelineIndex,
|
||||
sync_queue: &SyncQueue,
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
//! Timeline synchronization logic to fetch the layer files from remote storage into pageserver's local directory.
|
||||
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
fmt::Debug,
|
||||
path::Path,
|
||||
};
|
||||
use std::{collections::HashSet, fmt::Debug, path::Path};
|
||||
|
||||
use anyhow::Context;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use remote_storage::{path_with_suffix_extension, RemoteObjectName, RemoteStorage};
|
||||
use remote_storage::{path_with_suffix_extension, RemoteStorage};
|
||||
use tokio::{
|
||||
fs,
|
||||
io::{self, AsyncWriteExt},
|
||||
@@ -18,7 +14,7 @@ use tracing::{debug, error, info, warn};
|
||||
use crate::{
|
||||
config::PageServerConf, layered_repository::metadata::metadata_path, storage_sync::SyncTask,
|
||||
};
|
||||
use utils::zid::{ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
use utils::zid::ZTenantTimelineId;
|
||||
|
||||
use super::{
|
||||
index::{IndexPart, RemoteTimeline},
|
||||
@@ -27,108 +23,8 @@ use super::{
|
||||
|
||||
pub const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
|
||||
|
||||
/// FIXME: Needs cleanup. Currently it swallows errors. Here we need to ensure that
/// we successfully downloaded all metadata parts for one tenant.
/// "Successful" includes the absence of an index_part in the remote, because it is a valid situation
/// when a timeline was just created and the pageserver restarted before the upload of the index part completed.
/// But currently the RemoteStorage interface does not provide this knowledge, because it uses
/// anyhow::Error as its error type. So this needs a refactoring.
///
/// In other words, we need to yield only complete sets of tenant timelines:
/// a failure for one timeline of a tenant should exclude the whole tenant from the returned hashmap.
/// So there are two requirements: keep everything in one FuturesUnordered
/// to allow higher concurrency, and mark tenants as failed independently.
/// That requires some bookkeeping.
|
||||
pub async fn download_index_parts<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
keys: HashSet<ZTenantTimelineId>,
|
||||
) -> HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>>
|
||||
where
|
||||
P: Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let mut index_parts: HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>> = HashMap::new();
|
||||
|
||||
let mut part_downloads = keys
|
||||
.into_iter()
|
||||
.map(|id| async move { (id, download_index_part(conf, storage, id).await) })
|
||||
.collect::<FuturesUnordered<_>>();
|
||||
|
||||
while let Some((id, part_upload_result)) = part_downloads.next().await {
|
||||
match part_upload_result {
|
||||
Ok(index_part) => {
|
||||
debug!("Successfully fetched index part for {id}");
|
||||
index_parts
|
||||
.entry(id.tenant_id)
|
||||
.or_default()
|
||||
.insert(id.timeline_id, index_part);
|
||||
}
|
||||
Err(e) => error!("Failed to fetch index part for {id}: {e}"),
|
||||
}
|
||||
}
|
||||
|
||||
index_parts
|
||||
}
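The FIXME above asks for per-tenant completeness: a failed index-part download for any timeline should drop the whole tenant from the result, while other tenants keep their complete sets. A minimal, std-only sketch of that bookkeeping, using plain integer ids and a String payload in place of ZTenantTimelineId and IndexPart (names here are illustrative, not the pageserver's API):

use std::collections::{HashMap, HashSet};

type TenantId = u64;
type TimelineId = u64;

// Hypothetical per-download result: Ok(index part payload) or Err(reason).
fn group_complete_tenants(
    results: Vec<((TenantId, TimelineId), Result<String, String>)>,
) -> HashMap<TenantId, HashMap<TimelineId, String>> {
    let mut per_tenant: HashMap<TenantId, HashMap<TimelineId, String>> = HashMap::new();
    let mut failed_tenants: HashSet<TenantId> = HashSet::new();

    for ((tenant, timeline), result) in results {
        match result {
            Ok(part) => {
                per_tenant.entry(tenant).or_default().insert(timeline, part);
            }
            // One failed timeline marks the whole tenant as incomplete.
            Err(_) => {
                failed_tenants.insert(tenant);
            }
        }
    }

    // Yield only complete sets: drop tenants that had any failure.
    per_tenant.retain(|tenant, _| !failed_tenants.contains(tenant));
    per_tenant
}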
|
||||
|
||||
/// Note: the function is rather expensive from the S3 access point of view: it will execute ceil(N/1000) + N requests.
/// At least one request is needed to obtain the list of tenant timelines (more requests if there are more than 1000 timelines),
/// and then it will attempt to download all index files that belong to these timelines.
|
||||
pub async fn gather_tenant_timelines_index_parts<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
tenant_id: ZTenantId,
|
||||
) -> anyhow::Result<HashMap<ZTimelineId, IndexPart>>
|
||||
where
|
||||
P: RemoteObjectName + Debug + Send + Sync + 'static,
|
||||
S: RemoteStorage<RemoteObjectId = P> + Send + Sync + 'static,
|
||||
{
|
||||
let tenant_path = conf.timelines_path(&tenant_id);
|
||||
let tenant_storage_path = storage.remote_object_id(&tenant_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to get tenant storage path for local path '{}'",
|
||||
tenant_path.display()
|
||||
)
|
||||
})?;
|
||||
let timelines = storage
|
||||
.list_prefixes(Some(tenant_storage_path))
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to list tenant storage path to get remote timelines to download: {}",
|
||||
tenant_id
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut sync_ids = HashSet::new();
|
||||
|
||||
for timeline_remote_storage_key in timelines {
|
||||
let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
|
||||
anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
|
||||
})?;
|
||||
|
||||
let timeline_id: ZTimelineId = object_name
|
||||
.parse()
|
||||
.with_context(|| {
|
||||
format!("failed to parse object name into timeline id for tenant {tenant_id} '{object_name}'")
|
||||
})?;
|
||||
|
||||
sync_ids.insert(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
});
|
||||
}
|
||||
|
||||
download_index_parts(conf, storage, sync_ids)
|
||||
.await
|
||||
.remove(&tenant_id)
|
||||
.ok_or(anyhow::anyhow!(
|
||||
"Missing tenant index parts. This is a bug."
|
||||
))
|
||||
}
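To make the request-count note above concrete (assuming the usual 1000-key page size for list requests): 2,500 timelines cost 3 list calls plus 2,500 index-part downloads, i.e. 2,503 requests. A small helper expressing the same formula, for illustration only:

// Approximate request count for gathering the index parts of one tenant,
// assuming list pagination of 1000 keys per request.
fn approx_request_count(timelines: u64) -> u64 {
    let list_requests = (timelines + 999) / 1000; // ceil(N / 1000)
    list_requests + timelines                     // plus one GET per index part
}
// e.g. approx_request_count(2_500) == 3 + 2_500 == 2_503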
|
||||
|
||||
/// Retrieves index data from the remote storage for a given timeline.
|
||||
async fn download_index_part<P, S>(
|
||||
pub async fn download_index_part<P, S>(
|
||||
conf: &'static PageServerConf,
|
||||
storage: &S,
|
||||
sync_id: ZTenantTimelineId,
|
||||
@@ -148,23 +44,13 @@ where
|
||||
index_part_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut index_part_download =
|
||||
storage
|
||||
.download(&part_storage_path)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to open download stream for for storage path {part_storage_path:?}")
|
||||
})?;
|
||||
let mut index_part_bytes = Vec::new();
|
||||
io::copy(
|
||||
&mut index_part_download.download_stream,
|
||||
&mut index_part_bytes,
|
||||
)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to download an index part from storage path {part_storage_path:?}")
|
||||
})?;
|
||||
storage
|
||||
.download(&part_storage_path, &mut index_part_bytes)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Failed to download an index part from storage path {part_storage_path:?}")
|
||||
})?;
|
||||
|
||||
let index_part: IndexPart = serde_json::from_slice(&index_part_bytes).with_context(|| {
|
||||
format!("Failed to deserialize index part file from storage path '{part_storage_path:?}'")
|
||||
@@ -276,19 +162,15 @@ where
|
||||
temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
let mut download = storage
|
||||
.download(&layer_storage_path)
|
||||
|
||||
storage
|
||||
.download(&layer_storage_path, &mut destination_file)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open a download stream for layer with remote storage path '{layer_storage_path:?}'"
|
||||
"Failed to download a layer from storage path '{layer_storage_path:?}'"
|
||||
)
|
||||
})?;
|
||||
io::copy(&mut download.download_stream, &mut destination_file).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to download layer with remote storage path '{layer_storage_path:?}' into file '{}'", temp_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
// Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
|
||||
// A file will not be closed immediately when it goes out of scope if there are any IO operations
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
//! Able to restore itself from the storage index parts that are located in every timeline's remote directory and contain all data about
//! remote timeline layers and their metadata.
|
||||
|
||||
use std::ops::{Deref, DerefMut};
|
||||
use std::{
|
||||
collections::{HashMap, HashSet},
|
||||
path::{Path, PathBuf},
|
||||
@@ -15,10 +14,7 @@ use serde_with::{serde_as, DisplayFromStr};
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use crate::{config::PageServerConf, layered_repository::metadata::TimelineMetadata};
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use utils::{lsn::Lsn, zid::ZTenantTimelineId};
|
||||
|
||||
/// A part of the filesystem path, that needs a root to become a path again.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
|
||||
@@ -45,68 +41,38 @@ impl RelativePath {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct TenantEntry(HashMap<ZTimelineId, RemoteTimeline>);
|
||||
|
||||
impl TenantEntry {
|
||||
pub fn has_in_progress_downloads(&self) -> bool {
|
||||
self.values()
|
||||
.any(|remote_timeline| remote_timeline.awaits_download)
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for TenantEntry {
|
||||
type Target = HashMap<ZTimelineId, RemoteTimeline>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl DerefMut for TenantEntry {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<HashMap<ZTimelineId, RemoteTimeline>> for TenantEntry {
|
||||
fn from(inner: HashMap<ZTimelineId, RemoteTimeline>) -> Self {
|
||||
Self(inner)
|
||||
}
|
||||
}
|
||||
|
||||
/// An index to track tenant files that exist on the remote storage.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RemoteTimelineIndex {
|
||||
entries: HashMap<ZTenantId, TenantEntry>,
|
||||
timeline_entries: HashMap<ZTenantTimelineId, RemoteTimeline>,
|
||||
}
|
||||
|
||||
/// A wrapper to synchronize the access to the index, should be created and used before dealing with any [`RemoteTimelineIndex`].
|
||||
#[derive(Default)]
|
||||
pub struct RemoteIndex(Arc<RwLock<RemoteTimelineIndex>>);
|
||||
|
||||
impl RemoteIndex {
|
||||
pub fn empty() -> Self {
|
||||
Self(Arc::new(RwLock::new(RemoteTimelineIndex {
|
||||
timeline_entries: HashMap::new(),
|
||||
})))
|
||||
}
|
||||
|
||||
pub fn from_parts(
|
||||
conf: &'static PageServerConf,
|
||||
index_parts: HashMap<ZTenantId, HashMap<ZTimelineId, IndexPart>>,
|
||||
index_parts: HashMap<ZTenantTimelineId, IndexPart>,
|
||||
) -> anyhow::Result<Self> {
|
||||
let mut entries: HashMap<ZTenantId, TenantEntry> = HashMap::new();
|
||||
let mut timeline_entries = HashMap::new();
|
||||
|
||||
for (tenant_id, timelines) in index_parts {
|
||||
for (timeline_id, index_part) in timelines {
|
||||
let timeline_path = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
let remote_timeline =
|
||||
RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.context("Failed to restore remote timeline data from index part")?;
|
||||
|
||||
entries
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, remote_timeline);
|
||||
}
|
||||
for (sync_id, index_part) in index_parts {
|
||||
let timeline_path = conf.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
let remote_timeline = RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.context("Failed to restore remote timeline data from index part")?;
|
||||
timeline_entries.insert(sync_id, remote_timeline);
|
||||
}
|
||||
|
||||
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex { entries }))))
|
||||
Ok(Self(Arc::new(RwLock::new(RemoteTimelineIndex {
|
||||
timeline_entries,
|
||||
}))))
|
||||
}
|
||||
|
||||
pub async fn read(&self) -> tokio::sync::RwLockReadGuard<'_, RemoteTimelineIndex> {
|
||||
@@ -125,67 +91,20 @@ impl Clone for RemoteIndex {
|
||||
}
|
||||
|
||||
impl RemoteTimelineIndex {
|
||||
pub fn timeline_entry(
|
||||
&self,
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: &ZTenantTimelineId,
|
||||
) -> Option<&RemoteTimeline> {
|
||||
self.entries.get(tenant_id)?.get(timeline_id)
|
||||
pub fn timeline_entry(&self, id: &ZTenantTimelineId) -> Option<&RemoteTimeline> {
|
||||
self.timeline_entries.get(id)
|
||||
}
|
||||
|
||||
pub fn timeline_entry_mut(
|
||||
&mut self,
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: &ZTenantTimelineId,
|
||||
) -> Option<&mut RemoteTimeline> {
|
||||
self.entries.get_mut(tenant_id)?.get_mut(timeline_id)
|
||||
pub fn timeline_entry_mut(&mut self, id: &ZTenantTimelineId) -> Option<&mut RemoteTimeline> {
|
||||
self.timeline_entries.get_mut(id)
|
||||
}
|
||||
|
||||
pub fn add_timeline_entry(
|
||||
&mut self,
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: ZTenantTimelineId,
|
||||
entry: RemoteTimeline,
|
||||
) {
|
||||
self.entries
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.insert(timeline_id, entry);
|
||||
pub fn add_timeline_entry(&mut self, id: ZTenantTimelineId, entry: RemoteTimeline) {
|
||||
self.timeline_entries.insert(id, entry);
|
||||
}
|
||||
|
||||
pub fn remove_timeline_entry(
|
||||
&mut self,
|
||||
ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}: ZTenantTimelineId,
|
||||
) -> Option<RemoteTimeline> {
|
||||
self.entries
|
||||
.entry(tenant_id)
|
||||
.or_default()
|
||||
.remove(&timeline_id)
|
||||
}
|
||||
|
||||
pub fn tenant_entry(&self, tenant_id: &ZTenantId) -> Option<&TenantEntry> {
|
||||
self.entries.get(tenant_id)
|
||||
}
|
||||
|
||||
pub fn tenant_entry_mut(&mut self, tenant_id: &ZTenantId) -> Option<&mut TenantEntry> {
|
||||
self.entries.get_mut(tenant_id)
|
||||
}
|
||||
|
||||
pub fn add_tenant_entry(&mut self, tenant_id: ZTenantId) -> &mut TenantEntry {
|
||||
self.entries.entry(tenant_id).or_default()
|
||||
}
|
||||
|
||||
pub fn remove_tenant_entry(&mut self, tenant_id: &ZTenantId) -> Option<TenantEntry> {
|
||||
self.entries.remove(tenant_id)
|
||||
pub fn all_sync_ids(&self) -> impl Iterator<Item = ZTenantTimelineId> + '_ {
|
||||
self.timeline_entries.keys().copied()
|
||||
}
|
||||
|
||||
pub fn set_awaits_download(
|
||||
|
||||
@@ -37,7 +37,7 @@ pub mod defaults {
|
||||
pub const DEFAULT_PITR_INTERVAL: &str = "30 days";
|
||||
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds";
|
||||
pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
|
||||
pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10_000;
|
||||
}
|
||||
|
||||
/// Per-tenant configuration options
|
||||
|
||||
@@ -4,8 +4,8 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::layered_repository::{load_metadata, LayeredRepository};
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::repository::Repository;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimelineIndex};
|
||||
use crate::repository::{Repository, TimelineSyncStatusUpdate};
|
||||
use crate::storage_sync::index::RemoteIndex;
|
||||
use crate::storage_sync::{self, LocalTimelineInitStatus, SyncStartupData};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
@@ -13,11 +13,11 @@ use crate::timelines::CreateRepo;
|
||||
use crate::walredo::PostgresRedoManager;
|
||||
use crate::{thread_mgr, timelines, walreceiver};
|
||||
use crate::{DatadirTimelineImpl, RepositoryImpl};
|
||||
use anyhow::Context;
|
||||
use anyhow::{bail, Context};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::fmt;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::mpsc;
|
||||
@@ -157,13 +157,7 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
|
||||
// loading a tenant is serious, but it's better to complete the startup and
|
||||
// serve other tenants, than fail completely.
|
||||
error!("Failed to initialize local tenant {tenant_id}: {:?}", err);
|
||||
|
||||
if let Err(err) = set_tenant_state(tenant_id, TenantState::Broken) {
|
||||
error!(
|
||||
"Failed to set tenant state to broken {tenant_id}: {:?}",
|
||||
err
|
||||
);
|
||||
}
|
||||
set_tenant_state(tenant_id, TenantState::Broken)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -171,51 +165,44 @@ pub fn init_tenant_mgr(conf: &'static PageServerConf) -> anyhow::Result<RemoteIn
|
||||
}
|
||||
|
||||
pub enum LocalTimelineUpdate {
|
||||
Detach {
|
||||
id: ZTenantTimelineId,
|
||||
// used to signal to the detach caller that walreceiver successfully terminated for specified id
|
||||
join_confirmation_sender: std::sync::mpsc::Sender<()>,
|
||||
},
|
||||
Attach {
|
||||
id: ZTenantTimelineId,
|
||||
datadir: Arc<DatadirTimelineImpl>,
|
||||
},
|
||||
Detach(ZTenantTimelineId),
|
||||
Attach(ZTenantTimelineId, Arc<DatadirTimelineImpl>),
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for LocalTimelineUpdate {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
Self::Detach { id, .. } => f.debug_tuple("Remove").field(id).finish(),
|
||||
Self::Attach { id, .. } => f.debug_tuple("Add").field(id).finish(),
|
||||
Self::Detach(ttid) => f.debug_tuple("Remove").field(ttid).finish(),
|
||||
Self::Attach(ttid, _) => f.debug_tuple("Add").field(ttid).finish(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Updates tenants' repositories, changing their timelines state in memory.
|
||||
pub fn attach_downloaded_tenants(
|
||||
pub fn apply_timeline_sync_status_updates(
|
||||
conf: &'static PageServerConf,
|
||||
remote_index: &RemoteIndex,
|
||||
sync_status_updates: HashMap<ZTenantId, HashSet<ZTimelineId>>,
|
||||
sync_status_updates: HashMap<ZTenantId, HashMap<ZTimelineId, TimelineSyncStatusUpdate>>,
|
||||
) {
|
||||
if sync_status_updates.is_empty() {
|
||||
debug!("No sync status updates to apply");
|
||||
debug!("no sync status updates to apply");
|
||||
return;
|
||||
}
|
||||
for (tenant_id, downloaded_timelines) in sync_status_updates {
|
||||
info!(
|
||||
"Registering downlloaded timelines for {tenant_id} {} timelines",
|
||||
downloaded_timelines.len()
|
||||
);
|
||||
debug!("Downloaded timelines: {downloaded_timelines:?}");
|
||||
info!(
|
||||
"Applying sync status updates for {} timelines",
|
||||
sync_status_updates.len()
|
||||
);
|
||||
debug!("Sync status updates: {sync_status_updates:?}");
|
||||
|
||||
for (tenant_id, status_updates) in sync_status_updates {
|
||||
let repo = match load_local_repo(conf, tenant_id, remote_index) {
|
||||
Ok(repo) => repo,
|
||||
Err(e) => {
|
||||
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}");
|
||||
error!("Failed to load repo for tenant {tenant_id} Error: {e:?}",);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
match attach_downloaded_tenant(&repo, downloaded_timelines) {
|
||||
match apply_timeline_remote_sync_status_updates(&repo, status_updates) {
|
||||
Ok(()) => info!("successfully applied sync status updates for tenant {tenant_id}"),
|
||||
Err(e) => error!(
|
||||
"Failed to apply timeline sync timeline status updates for tenant {tenant_id}: {e:?}"
|
||||
@@ -243,6 +230,8 @@ pub fn shutdown_all_tenants() {
|
||||
drop(m);
|
||||
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::WalReceiverManager), None, None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), None, None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), None, None);
|
||||
|
||||
// Ok, no background threads running anymore. Flush any remaining data in
|
||||
// memory to disk.
|
||||
@@ -341,12 +330,44 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
||||
}
|
||||
(TenantState::Idle, TenantState::Active) => {
|
||||
info!("activating tenant {tenant_id}");
|
||||
let compactor_spawn_result = thread_mgr::spawn(
|
||||
ThreadKind::Compactor,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"Compactor thread",
|
||||
false,
|
||||
move || crate::tenant_threads::compact_loop(tenant_id),
|
||||
);
|
||||
if compactor_spawn_result.is_err() {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
m.get_mut(&tenant_id)
|
||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
|
||||
.state = old_state;
|
||||
drop(m);
|
||||
}
|
||||
compactor_spawn_result?;
|
||||
|
||||
// Spawn gc and compaction loops. The loops will shut themselves
|
||||
// down when they notice that the tenant is inactive.
|
||||
// TODO maybe use tokio::sync::watch instead?
|
||||
crate::tenant_tasks::start_compaction_loop(tenant_id)?;
|
||||
crate::tenant_tasks::start_gc_loop(tenant_id)?;
|
||||
let gc_spawn_result = thread_mgr::spawn(
|
||||
ThreadKind::GarbageCollector,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"GC thread",
|
||||
false,
|
||||
move || crate::tenant_threads::gc_loop(tenant_id),
|
||||
)
|
||||
.map(|_thread_id| ()) // update the `Result::Ok` type to match the outer function's return signature
|
||||
.with_context(|| format!("Failed to launch GC thread for tenant {tenant_id}"));
|
||||
|
||||
if let Err(e) = &gc_spawn_result {
|
||||
let mut m = tenants_state::write_tenants();
|
||||
m.get_mut(&tenant_id)
|
||||
.with_context(|| format!("Tenant not found for id {tenant_id}"))?
|
||||
.state = old_state;
|
||||
drop(m);
|
||||
error!("Failed to start GC thread for tenant {tenant_id}, stopping its checkpointer thread: {e:?}");
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
||||
return gc_spawn_result;
|
||||
}
|
||||
}
|
||||
(TenantState::Idle, TenantState::Stopping) => {
|
||||
info!("stopping idle tenant {tenant_id}");
|
||||
@@ -358,10 +379,8 @@ pub fn set_tenant_state(tenant_id: ZTenantId, new_state: TenantState) -> anyhow:
|
||||
Some(tenant_id),
|
||||
None,
|
||||
);
|
||||
|
||||
// Wait until all gc/compaction tasks finish
|
||||
let repo = get_repository_for_tenant(tenant_id)?;
|
||||
let _guard = repo.file_lock.write().unwrap();
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::GarbageCollector), Some(tenant_id), None);
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::Compactor), Some(tenant_id), None);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -400,86 +419,33 @@ pub fn get_local_timeline_with_load(
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete_timeline(tenant_id: ZTenantId, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
// Start with the shutdown of timeline tasks (this shuts down the walreceiver).
// It is important that we do not take locks here and do not check whether the timeline exists,
// because if we hold tenants_state::write_tenants() while waiting for the threads to join,
// we cannot create new timelines and tenants, and that can take quite some time;
// it can even get stuck due to a bug, making the whole pageserver unavailable for some operations.
// So this is how we deal with concurrent delete requests: shut everything down, wait for confirmation,
// and then try to actually remove the timeline from the in-memory state. This is the point where concurrent requests
// synchronize and either fail with a not-found error or succeed.
|
||||
pub fn detach_timeline(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
// shutdown the timeline threads (this shuts down the walreceiver)
|
||||
thread_mgr::shutdown_threads(None, Some(tenant_id), Some(timeline_id));
|
||||
|
||||
let (sender, receiver) = std::sync::mpsc::channel::<()>();
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
|
||||
id: ZTenantTimelineId::new(tenant_id, timeline_id),
|
||||
join_confirmation_sender: sender,
|
||||
});
|
||||
|
||||
debug!("waiting for wal receiver to shutdown");
|
||||
let _ = receiver.recv();
|
||||
debug!("wal receiver shutdown confirmed");
|
||||
debug!("waiting for threads to shutdown");
|
||||
thread_mgr::shutdown_threads(None, None, Some(timeline_id));
|
||||
debug!("thread shutdown completed");
|
||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||
Some(tenant) => {
|
||||
tenant.repo.delete_timeline(timeline_id)?;
|
||||
tenant
|
||||
.repo
|
||||
.detach_timeline(timeline_id)
|
||||
.context("Failed to detach inmem tenant timeline")?;
|
||||
tenant.local_timelines.remove(&timeline_id);
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach(
|
||||
ZTenantTimelineId::new(tenant_id, timeline_id),
|
||||
));
|
||||
}
|
||||
None => anyhow::bail!("Tenant {tenant_id} not found in local tenant state"),
|
||||
None => bail!("Tenant {tenant_id} not found in local tenant state"),
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
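The detach path above hands the walreceiver a join-confirmation sender and blocks on the receiver until shutdown is confirmed. A self-contained sketch of that handshake with std threads (function names are illustrative, not the pageserver's API):

use std::sync::mpsc;
use std::thread;

// Illustrative stand-in for the walreceiver side: it does its cleanup and then
// confirms by sending on the channel it was handed in the Detach event.
fn shutdown_walreceiver(confirm: mpsc::Sender<()>) {
    // ... stop WAL streaming for the timeline ...
    let _ = confirm.send(()); // the caller may have given up; ignore send errors
}

fn detach_with_confirmation() {
    let (sender, receiver) = mpsc::channel::<()>();
    let worker = thread::spawn(move || shutdown_walreceiver(sender));

    // Block until the walreceiver confirms, mirroring `receiver.recv()` above.
    let _ = receiver.recv();
    let _ = worker.join();
}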
|
||||
|
||||
pub fn detach_tenant(conf: &'static PageServerConf, tenant_id: ZTenantId) -> anyhow::Result<()> {
|
||||
set_tenant_state(tenant_id, TenantState::Stopping)?;
|
||||
// shutdown the tenant and timeline threads: gc, compaction, page service threads)
|
||||
thread_mgr::shutdown_threads(None, Some(tenant_id), None);
|
||||
|
||||
// FIXME should we protect somehow from starting new threads/walreceivers when tenant is in stopping state?
|
||||
// send stop signal to wal receiver and collect join handles while holding the lock
|
||||
let walreceiver_join_handles = {
|
||||
let tenants = tenants_state::write_tenants();
|
||||
let tenant = tenants.get(&tenant_id).context("tenant not found")?;
|
||||
let mut walreceiver_join_handles = Vec::with_capacity(tenant.local_timelines.len());
|
||||
for timeline_id in tenant.local_timelines.keys() {
|
||||
let (sender, receiver) = std::sync::mpsc::channel::<()>();
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Detach {
|
||||
id: ZTenantTimelineId::new(tenant_id, *timeline_id),
|
||||
join_confirmation_sender: sender,
|
||||
});
|
||||
walreceiver_join_handles.push((*timeline_id, receiver));
|
||||
}
|
||||
// drop the tenants lock
|
||||
walreceiver_join_handles
|
||||
};
|
||||
|
||||
// wait for wal receivers to stop without holding the lock, because walreceiver
|
||||
// will attempt to change tenant state which is protected by the same global tenants lock.
|
||||
// TODO do we need a timeout here? how to handle it?
|
||||
// recv_timeout is broken: https://github.com/rust-lang/rust/issues/94518#issuecomment-1057440631
|
||||
// need to use crossbeam-channel
|
||||
for (timeline_id, join_handle) in walreceiver_join_handles {
|
||||
info!("waiting for wal receiver to shutdown timeline_id {timeline_id}");
|
||||
join_handle.recv().context("failed to join walreceiver")?;
|
||||
info!("wal receiver shutdown confirmed timeline_id {timeline_id}");
|
||||
}
|
||||
|
||||
tenants_state::write_tenants().remove(&tenant_id);
|
||||
|
||||
// If removal fails there will be no way to successfully retry detach,
// because the tenant no longer exists in the in-memory map. And it needs to be removed from the map
// before we remove the files, because it contains references to the repository,
// which references ephemeral files that are deleted on drop. So if we kept these references,
// the code would attempt to remove files that no longer exist. This can be fixed by having a shutdown
// mechanism for the repository that cleans up temporary data, avoiding any references to ephemeral files.
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
std::fs::remove_dir_all(&local_tenant_directory).with_context(|| {
|
||||
let local_timeline_directory = conf.timeline_path(&timeline_id, &tenant_id);
|
||||
std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove local timeline directory '{}'",
|
||||
local_tenant_directory.display()
|
||||
local_timeline_directory.display()
|
||||
)
|
||||
})?;
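The TODO above points at crossbeam-channel because std's recv_timeout is affected by the linked issue. A hedged sketch of what a bounded wait could look like, assuming the crossbeam-channel crate were added as a dependency:

use std::time::Duration;

use crossbeam_channel::{bounded, RecvTimeoutError};

fn wait_for_confirmation_with_timeout() {
    let (sender, receiver) = bounded::<()>(1);

    // The walreceiver side would hold `sender` and send () once it has shut down.
    std::thread::spawn(move || {
        // ... shutdown work ...
        let _ = sender.send(());
    });

    match receiver.recv_timeout(Duration::from_secs(10)) {
        Ok(()) => println!("walreceiver shutdown confirmed"),
        Err(RecvTimeoutError::Timeout) => println!("timed out waiting for walreceiver"),
        Err(RecvTimeoutError::Disconnected) => println!("walreceiver dropped its sender"),
    }
}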
|
||||
|
||||
@@ -500,10 +466,10 @@ fn load_local_timeline(
|
||||
));
|
||||
page_tline.init_logical_size()?;
|
||||
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach {
|
||||
id: ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
|
||||
datadir: Arc::clone(&page_tline),
|
||||
});
|
||||
tenants_state::try_send_timeline_update(LocalTimelineUpdate::Attach(
|
||||
ZTenantTimelineId::new(repo.tenant_id(), timeline_id),
|
||||
Arc::clone(&page_tline),
|
||||
));
|
||||
|
||||
Ok(page_tline)
|
||||
}
|
||||
@@ -513,27 +479,15 @@ fn load_local_timeline(
|
||||
pub struct TenantInfo {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub id: ZTenantId,
|
||||
pub state: Option<TenantState>,
|
||||
pub has_in_progress_downloads: Option<bool>,
|
||||
pub state: TenantState,
|
||||
}
|
||||
|
||||
pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
|
||||
pub fn list_tenants() -> Vec<TenantInfo> {
|
||||
tenants_state::read_tenants()
|
||||
.iter()
|
||||
.map(|(id, tenant)| {
|
||||
let has_in_progress_downloads = remote_index
|
||||
.tenant_entry(id)
|
||||
.map(|entry| entry.has_in_progress_downloads());
|
||||
|
||||
if has_in_progress_downloads.is_none() {
|
||||
error!("timeline is not found in remote index while it is present in the tenants registry")
|
||||
}
|
||||
|
||||
TenantInfo {
|
||||
id: *id,
|
||||
state: Some(tenant.state),
|
||||
has_in_progress_downloads,
|
||||
}
|
||||
.map(|(id, tenant)| TenantInfo {
|
||||
id: *id,
|
||||
state: tenant.state,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
@@ -545,73 +499,74 @@ pub fn list_tenants(remote_index: &RemoteTimelineIndex) -> Vec<TenantInfo> {
|
||||
/// A timeline is categorized as broken when any of following conditions is true:
|
||||
/// - failed to load the timeline's metadata
|
||||
/// - the timeline's disk consistent LSN is zero
|
||||
fn check_broken_timeline(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
) -> anyhow::Result<()> {
|
||||
let metadata =
|
||||
load_metadata(conf, timeline_id, tenant_id).context("failed to load metadata")?;
|
||||
fn check_broken_timeline(repo: &LayeredRepository, timeline_id: ZTimelineId) -> anyhow::Result<()> {
|
||||
let metadata = load_metadata(repo.conf, timeline_id, repo.tenant_id())
|
||||
.context("failed to load metadata")?;
|
||||
|
||||
// A timeline with zero disk consistent LSN can happen when the page server
|
||||
// failed to checkpoint the timeline import data when creating that timeline.
|
||||
if metadata.disk_consistent_lsn() == Lsn::INVALID {
|
||||
anyhow::bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
|
||||
bail!("Timeline {timeline_id} has a zero disk consistent LSN.");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Note: all timelines are attached at once if and only if all of them are locally complete
|
||||
fn init_local_repository(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: ZTenantId,
|
||||
local_timeline_init_statuses: HashMap<ZTimelineId, LocalTimelineInitStatus>,
|
||||
remote_index: &RemoteIndex,
|
||||
) -> anyhow::Result<(), anyhow::Error> {
|
||||
let mut timelines_to_attach = HashSet::new();
|
||||
// initialize local tenant
|
||||
let repo = load_local_repo(conf, tenant_id, remote_index)
|
||||
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
||||
|
||||
let mut status_updates = HashMap::with_capacity(local_timeline_init_statuses.len());
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {timeline_id} for tenant {tenant_id} is locally complete, registering it in repository");
|
||||
check_broken_timeline(conf, tenant_id, timeline_id)
|
||||
.context("found broken timeline")?;
|
||||
timelines_to_attach.insert(timeline_id);
|
||||
if let Err(err) = check_broken_timeline(&repo, timeline_id) {
|
||||
info!(
|
||||
"Found a broken timeline {timeline_id} (err={err:?}), skip registering it in repository"
|
||||
);
|
||||
} else {
|
||||
status_updates.insert(timeline_id, TimelineSyncStatusUpdate::Downloaded);
|
||||
}
|
||||
}
|
||||
LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {tenant_id} for tenant {timeline_id} needs sync, \
|
||||
so skipped for adding into repository until sync is finished"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// initialize local tenant
|
||||
let repo = load_local_repo(conf, tenant_id, remote_index)
|
||||
.with_context(|| format!("Failed to load repo for tenant {tenant_id}"))?;
|
||||
|
||||
// Let's fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
attach_downloaded_tenant(&repo, timelines_to_attach)
|
||||
apply_timeline_remote_sync_status_updates(&repo, status_updates)
|
||||
.with_context(|| format!("Failed to bootstrap timelines for tenant {tenant_id}"))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn attach_downloaded_tenant(
|
||||
fn apply_timeline_remote_sync_status_updates(
|
||||
repo: &LayeredRepository,
|
||||
downloaded_timelines: HashSet<ZTimelineId>,
|
||||
status_updates: HashMap<ZTimelineId, TimelineSyncStatusUpdate>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut registration_queue = Vec::with_capacity(downloaded_timelines.len());
|
||||
let mut registration_queue = Vec::with_capacity(status_updates.len());
|
||||
|
||||
// first need to register the in-mem representations, to avoid missing ancestors during the local disk data registration
|
||||
for timeline_id in downloaded_timelines {
|
||||
repo.attach_timeline(timeline_id).with_context(|| {
|
||||
format!("Failed to load timeline {timeline_id} into in-memory repository")
|
||||
})?;
|
||||
registration_queue.push(timeline_id);
|
||||
for (timeline_id, status_update) in status_updates {
|
||||
repo.apply_timeline_remote_sync_status_update(timeline_id, status_update)
|
||||
.with_context(|| {
|
||||
format!("Failed to load timeline {timeline_id} into in-memory repository")
|
||||
})?;
|
||||
match status_update {
|
||||
TimelineSyncStatusUpdate::Downloaded => registration_queue.push(timeline_id),
|
||||
}
|
||||
}
|
||||
|
||||
for timeline_id in registration_queue {
|
||||
@@ -619,7 +574,7 @@ fn attach_downloaded_tenant(
|
||||
match tenants_state::write_tenants().get_mut(&tenant_id) {
|
||||
Some(tenant) => match tenant.local_timelines.entry(timeline_id) {
|
||||
Entry::Occupied(_) => {
|
||||
anyhow::bail!("Local timeline {timeline_id} already registered")
|
||||
bail!("Local timeline {timeline_id} already registered")
|
||||
}
|
||||
Entry::Vacant(v) => {
|
||||
v.insert(load_local_timeline(repo, timeline_id).with_context(|| {
|
||||
@@ -627,7 +582,7 @@ fn attach_downloaded_tenant(
|
||||
})?);
|
||||
}
|
||||
},
|
||||
None => anyhow::bail!(
|
||||
None => bail!(
|
||||
"Tenant {} not found in local tenant state",
|
||||
repo.tenant_id()
|
||||
),
|
||||
|
||||
@@ -1,286 +0,0 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as compaction and GC
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::ops::ControlFlow;
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::repository::Repository;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::{tenant_mgr, thread_mgr};
|
||||
use anyhow::{self, Context};
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::StreamExt;
|
||||
use metrics::{register_int_counter_vec, IntCounterVec};
|
||||
use once_cell::sync::{Lazy, OnceCell};
|
||||
use tokio::sync::mpsc;
|
||||
use tokio::sync::watch;
|
||||
use tracing::*;
|
||||
use utils::zid::ZTenantId;
|
||||
|
||||
static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||
register_int_counter_vec!(
|
||||
"pageserver_tenant_task_events",
|
||||
"Number of task start/stop/fail events.",
|
||||
&["event"],
|
||||
)
|
||||
.expect("Failed to register tenant_task_events metric")
|
||||
});
|
||||
|
||||
///
|
||||
/// Compaction task's main loop
|
||||
///
|
||||
async fn compaction_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
// Run blocking part of the task
|
||||
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
|
||||
// Break if tenant is not active
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
|
||||
// Break if we're not allowed to write to disk
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
// TODO do this inside repo.compaction_iteration instead.
|
||||
let _guard = match repo.file_lock.try_read() {
|
||||
Ok(g) => g,
|
||||
Err(_) => return Ok(ControlFlow::Break(())),
|
||||
};
|
||||
|
||||
// Run compaction
|
||||
let compaction_period = repo.get_compaction_period();
|
||||
repo.compaction_iteration()?;
|
||||
Ok(ControlFlow::Continue(compaction_period))
|
||||
})
|
||||
.await;
|
||||
|
||||
// Decide whether to sleep or break
|
||||
let sleep_duration = match period {
|
||||
Ok(Ok(ControlFlow::Continue(period))) => period,
|
||||
Ok(Ok(ControlFlow::Break(()))) => break,
|
||||
Ok(Err(e)) => {
|
||||
error!("Compaction failed, retrying: {}", e);
|
||||
Duration::from_secs(2)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Compaction join error, retrying: {}", e);
|
||||
Duration::from_secs(2)
|
||||
}
|
||||
};
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = cancel.changed() => {
|
||||
trace!("received cancellation request");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
}
|
||||
}
|
||||
|
||||
trace!(
|
||||
"compaction loop stopped. State is {:?}",
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
}
|
||||
|
||||
static START_GC_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
|
||||
static START_COMPACTION_LOOP: OnceCell<mpsc::Sender<ZTenantId>> = OnceCell::new();
|
||||
|
||||
/// Spawn a task that will periodically schedule garbage collection until
|
||||
/// the tenant becomes inactive. This should be called on tenant
|
||||
/// activation.
|
||||
pub fn start_gc_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
|
||||
START_GC_LOOP
|
||||
.get()
|
||||
.context("Failed to get START_GC_LOOP")?
|
||||
.blocking_send(tenantid)
|
||||
.context("Failed to send to START_GC_LOOP channel")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn a task that will periodically schedule compaction until
|
||||
/// the tenant becomes inactive. This should be called on tenant
|
||||
/// activation.
|
||||
pub fn start_compaction_loop(tenantid: ZTenantId) -> anyhow::Result<()> {
|
||||
START_COMPACTION_LOOP
|
||||
.get()
|
||||
.context("failed to get START_COMPACTION_LOOP")?
|
||||
.blocking_send(tenantid)
|
||||
.context("failed to send to START_COMPACTION_LOOP")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Spawn the TenantTaskManager
|
||||
/// This needs to be called before start_gc_loop or start_compaction_loop
|
||||
pub fn init_tenant_task_pool() -> anyhow::Result<()> {
|
||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("tenant-task-worker")
|
||||
.enable_all()
|
||||
.build()?;
|
||||
|
||||
let (gc_send, mut gc_recv) = mpsc::channel::<ZTenantId>(100);
|
||||
START_GC_LOOP
|
||||
.set(gc_send)
|
||||
.expect("Failed to set START_GC_LOOP");
|
||||
|
||||
let (compaction_send, mut compaction_recv) = mpsc::channel::<ZTenantId>(100);
|
||||
START_COMPACTION_LOOP
|
||||
.set(compaction_send)
|
||||
.expect("Failed to set START_COMPACTION_LOOP");
|
||||
|
||||
// TODO this is getting repetitive
|
||||
let mut gc_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
|
||||
let mut compaction_loops = HashMap::<ZTenantId, watch::Sender<()>>::new();
|
||||
|
||||
thread_mgr::spawn(
|
||||
ThreadKind::TenantTaskManager,
|
||||
None,
|
||||
None,
|
||||
"Tenant task manager main thread",
|
||||
true,
|
||||
move || {
|
||||
runtime.block_on(async move {
|
||||
let mut futures = FuturesUnordered::new();
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = thread_mgr::shutdown_watcher() => {
|
||||
// Send cancellation to all tasks
|
||||
for (_, cancel) in gc_loops.drain() {
|
||||
cancel.send(()).ok();
|
||||
}
|
||||
for (_, cancel) in compaction_loops.drain() {
|
||||
cancel.send(()).ok();
|
||||
}
|
||||
|
||||
// Exit after all tasks finish
|
||||
while let Some(result) = futures.next().await {
|
||||
match result {
|
||||
Ok(()) => {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
},
|
||||
Err(e) => {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
|
||||
error!("loop join error {}", e)
|
||||
},
|
||||
}
|
||||
}
|
||||
break;
|
||||
},
|
||||
tenantid = gc_recv.recv() => {
|
||||
let tenantid = tenantid.expect("Gc task channel closed unexpectedly");
|
||||
|
||||
// Spawn new task, request cancellation of the old one if exists
|
||||
let (cancel_send, cancel_recv) = watch::channel(());
|
||||
let handle = tokio::spawn(gc_loop(tenantid, cancel_recv)
|
||||
.instrument(info_span!("gc loop", tenant = %tenantid)));
|
||||
if let Some(old_cancel_send) = gc_loops.insert(tenantid, cancel_send) {
|
||||
old_cancel_send.send(()).ok();
|
||||
}
|
||||
|
||||
// Update metrics, remember handle
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
futures.push(handle);
|
||||
},
|
||||
tenantid = compaction_recv.recv() => {
|
||||
let tenantid = tenantid.expect("Compaction task channel closed unexpectedly");
|
||||
|
||||
// Spawn new task, request cancellation of the old one if exists
|
||||
let (cancel_send, cancel_recv) = watch::channel(());
|
||||
let handle = tokio::spawn(compaction_loop(tenantid, cancel_recv)
|
||||
.instrument(info_span!("compaction loop", tenant = %tenantid)));
|
||||
if let Some(old_cancel_send) = compaction_loops.insert(tenantid, cancel_send) {
|
||||
old_cancel_send.send(()).ok();
|
||||
}
|
||||
|
||||
// Update metrics, remember handle
|
||||
TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
|
||||
futures.push(handle);
|
||||
},
|
||||
result = futures.next() => {
|
||||
// Log and count any unhandled panics
|
||||
match result {
|
||||
Some(Ok(())) => {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc();
|
||||
},
|
||||
Some(Err(e)) => {
|
||||
TENANT_TASK_EVENTS.with_label_values(&["panic"]).inc();
|
||||
error!("loop join error {}", e)
|
||||
},
|
||||
None => {},
|
||||
};
|
||||
},
|
||||
}
|
||||
}
|
||||
});
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
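init_tenant_task_pool must run before start_gc_loop or start_compaction_loop, because those calls go through the OnceCell-backed channels set up here. A reduced sketch of that init-before-use contract (simplified to one channel and a plain u64 tenant id; not the pageserver's actual API):

use once_cell::sync::OnceCell;
use tokio::sync::mpsc;

static START_LOOP: OnceCell<mpsc::Sender<u64>> = OnceCell::new();

// Must be called first, from the manager: stores the sender side of the channel.
fn init_pool() -> mpsc::Receiver<u64> {
    let (send, recv) = mpsc::channel(100);
    START_LOOP.set(send).expect("init_pool called twice");
    recv
}

// Mirrors start_gc_loop/start_compaction_loop: fails cleanly if the pool
// was not initialized first, because the OnceCell is still empty.
// Note: blocking_send is meant to be called from synchronous (non-runtime) threads.
fn request_loop_start(tenant: u64) -> anyhow::Result<()> {
    START_LOOP
        .get()
        .ok_or_else(|| anyhow::anyhow!("task pool not initialized"))?
        .blocking_send(tenant)?;
    Ok(())
}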
|
||||
|
||||
///
|
||||
/// GC task's main loop
|
||||
///
|
||||
async fn gc_loop(tenantid: ZTenantId, mut cancel: watch::Receiver<()>) {
|
||||
loop {
|
||||
trace!("waking up");
|
||||
|
||||
// Run blocking part of the task
|
||||
let period: Result<Result<_, anyhow::Error>, _> = tokio::task::spawn_blocking(move || {
|
||||
// Break if tenant is not active
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
|
||||
// Break if we're not allowed to write to disk
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
// TODO do this inside repo.gc_iteration instead.
|
||||
let _guard = match repo.file_lock.try_read() {
|
||||
Ok(g) => g,
|
||||
Err(_) => return Ok(ControlFlow::Break(())),
|
||||
};
|
||||
|
||||
// Run gc
|
||||
let gc_period = repo.get_gc_period();
|
||||
let gc_horizon = repo.get_gc_horizon();
|
||||
if gc_horizon > 0 {
|
||||
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
|
||||
}
|
||||
|
||||
Ok(ControlFlow::Continue(gc_period))
|
||||
})
|
||||
.await;
|
||||
|
||||
// Decide whether to sleep or break
|
||||
let sleep_duration = match period {
|
||||
Ok(Ok(ControlFlow::Continue(period))) => period,
|
||||
Ok(Ok(ControlFlow::Break(()))) => break,
|
||||
Ok(Err(e)) => {
|
||||
error!("Gc failed, retrying: {}", e);
|
||||
Duration::from_secs(2)
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Gc join error, retrying: {}", e);
|
||||
Duration::from_secs(2)
|
||||
}
|
||||
};
|
||||
|
||||
// Sleep
|
||||
tokio::select! {
|
||||
_ = cancel.changed() => {
|
||||
trace!("received cancellation request");
|
||||
break;
|
||||
},
|
||||
_ = tokio::time::sleep(sleep_duration) => {},
|
||||
}
|
||||
}
|
||||
trace!(
|
||||
"GC loop stopped. State is {:?}",
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
}
|
||||
pageserver/src/tenant_threads.rs (new file, 79 lines)
@@ -0,0 +1,79 @@
|
||||
//! This module contains functions to serve per-tenant background processes,
|
||||
//! such as compaction and GC
|
||||
use crate::repository::Repository;
|
||||
use crate::tenant_mgr;
|
||||
use crate::tenant_mgr::TenantState;
|
||||
use anyhow::Result;
|
||||
use std::time::Duration;
|
||||
use tracing::*;
|
||||
use utils::zid::ZTenantId;
|
||||
|
||||
///
|
||||
/// Compaction thread's main loop
|
||||
///
|
||||
pub fn compact_loop(tenantid: ZTenantId) -> Result<()> {
|
||||
if let Err(err) = compact_loop_ext(tenantid) {
|
||||
error!("compact loop terminated with error: {:?}", err);
|
||||
Err(err)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn compact_loop_ext(tenantid: ZTenantId) -> Result<()> {
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let compaction_period = repo.get_compaction_period();
|
||||
|
||||
std::thread::sleep(compaction_period);
|
||||
trace!("compaction thread for tenant {} waking up", tenantid);
|
||||
|
||||
// Compact timelines
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
repo.compaction_iteration()?;
|
||||
}
|
||||
|
||||
trace!(
|
||||
"compaction thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// GC thread's main loop
|
||||
///
|
||||
pub fn gc_loop(tenantid: ZTenantId) -> Result<()> {
|
||||
loop {
|
||||
if tenant_mgr::get_tenant_state(tenantid) != Some(TenantState::Active) {
|
||||
break;
|
||||
}
|
||||
|
||||
trace!("gc thread for tenant {} waking up", tenantid);
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
let gc_horizon = repo.get_gc_horizon();
|
||||
// Garbage collect old files that are not needed for PITR anymore
|
||||
if gc_horizon > 0 {
|
||||
repo.gc_iteration(None, gc_horizon, repo.get_pitr_interval(), false)?;
|
||||
}
|
||||
|
||||
// TODO Write this in a more adequate way, using
// condvar.wait_timeout() or something similar
|
||||
let mut sleep_time = repo.get_gc_period().as_secs();
|
||||
while sleep_time > 0 && tenant_mgr::get_tenant_state(tenantid) == Some(TenantState::Active)
|
||||
{
|
||||
sleep_time -= 1;
|
||||
std::thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
}
|
||||
trace!(
|
||||
"GC thread stopped for tenant {} state is {:?}",
|
||||
tenantid,
|
||||
tenant_mgr::get_tenant_state(tenantid)
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
@@ -94,8 +94,11 @@ pub enum ThreadKind {
|
||||
// Main walreceiver manager thread that ensures that every timeline spawns a connection to safekeeper, to fetch WAL.
|
||||
WalReceiverManager,
|
||||
|
||||
// Thread that schedules new compaction and gc jobs
|
||||
TenantTaskManager,
|
||||
// Thread that handles compaction of all timelines for a tenant.
|
||||
Compactor,
|
||||
|
||||
// Thread that handles GC of a tenant
|
||||
GarbageCollector,
|
||||
|
||||
// Thread that flushes frozen in-memory layers to disk
|
||||
LayerFlushThread,
|
||||
|
||||
@@ -202,7 +202,7 @@ pub fn create_repo(
|
||||
// anymore, but I think that could still happen.
|
||||
let wal_redo_manager = Arc::new(crate::walredo::DummyRedoManager {});
|
||||
|
||||
(wal_redo_manager as _, RemoteIndex::default())
|
||||
(wal_redo_manager as _, RemoteIndex::empty())
|
||||
}
|
||||
};
|
||||
|
||||
@@ -347,7 +347,7 @@ pub(crate) fn create_timeline(
|
||||
tenant_id: ZTenantId,
|
||||
new_timeline_id: Option<ZTimelineId>,
|
||||
ancestor_timeline_id: Option<ZTimelineId>,
|
||||
mut ancestor_start_lsn: Option<Lsn>,
|
||||
ancestor_start_lsn: Option<Lsn>,
|
||||
) -> Result<Option<TimelineInfo>> {
|
||||
let new_timeline_id = new_timeline_id.unwrap_or_else(ZTimelineId::generate);
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
@@ -357,35 +357,41 @@ pub(crate) fn create_timeline(
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut start_lsn = ancestor_start_lsn.unwrap_or(Lsn(0));
|
||||
|
||||
let new_timeline_info = match ancestor_timeline_id {
|
||||
Some(ancestor_timeline_id) => {
|
||||
let ancestor_timeline = repo
|
||||
.get_timeline_load(ancestor_timeline_id)
|
||||
.context("Cannot branch off the timeline that's not present locally")?;
|
||||
|
||||
if let Some(lsn) = ancestor_start_lsn.as_mut() {
|
||||
if start_lsn == Lsn(0) {
|
||||
// Find end of WAL on the old timeline
|
||||
let end_of_wal = ancestor_timeline.get_last_record_lsn();
|
||||
info!("branching at end of WAL: {}", end_of_wal);
|
||||
start_lsn = end_of_wal;
|
||||
} else {
|
||||
// Wait for the WAL to arrive and be processed on the parent branch up
|
||||
// to the requested branch point. The repository code itself doesn't
|
||||
// require it, but if we start to receive WAL on the new timeline,
|
||||
// decoding the new WAL might need to look up previous pages, relation
|
||||
// sizes etc. and that would get confused if the previous page versions
|
||||
// are not in the repository yet.
|
||||
*lsn = lsn.align();
|
||||
ancestor_timeline.wait_lsn(*lsn)?;
|
||||
ancestor_timeline.wait_lsn(start_lsn)?;
|
||||
}
|
||||
start_lsn = start_lsn.align();
|
||||
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > *lsn {
|
||||
// can we safely just branch from the ancestor instead?
|
||||
anyhow::bail!(
|
||||
let ancestor_ancestor_lsn = ancestor_timeline.get_ancestor_lsn();
|
||||
if ancestor_ancestor_lsn > start_lsn {
|
||||
// can we safely just branch from the ancestor instead?
|
||||
anyhow::bail!(
|
||||
"invalid start lsn {} for ancestor timeline {}: less than timeline ancestor lsn {}",
|
||||
lsn,
|
||||
start_lsn,
|
||||
ancestor_timeline_id,
|
||||
ancestor_ancestor_lsn,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn)?;
|
||||
repo.branch_timeline(ancestor_timeline_id, new_timeline_id, start_lsn)?;
|
||||
// load the timeline into memory
|
||||
let loaded_timeline =
|
||||
tenant_mgr::get_local_timeline_with_load(tenant_id, new_timeline_id)?;
|
||||
|
||||
@@ -91,6 +91,7 @@ pub fn init_wal_receiver_main_thread(
|
||||
|
||||
let runtime = tokio::runtime::Builder::new_multi_thread()
|
||||
.thread_name("wal-receiver-runtime-thread")
|
||||
.worker_threads(40)
|
||||
.enable_all()
|
||||
.on_thread_start(|| IS_WAL_RECEIVER.with(|c| c.set(true)))
|
||||
.build()
|
||||
@@ -177,7 +178,7 @@ async fn shutdown_all_wal_connections(
|
||||
/// That may lead to certain events not being observed by the listener.
|
||||
#[derive(Debug)]
|
||||
struct TaskHandle<E> {
|
||||
handle: JoinHandle<Result<(), String>>,
|
||||
handle: JoinHandle<()>,
|
||||
events_receiver: watch::Receiver<TaskEvent<E>>,
|
||||
cancellation: watch::Sender<()>,
|
||||
}
|
||||
@@ -204,8 +205,8 @@ impl<E: Clone> TaskHandle<E> {
|
||||
|
||||
let sender = Arc::clone(&events_sender);
|
||||
let handle = tokio::task::spawn(async move {
|
||||
events_sender.send(TaskEvent::Started).ok();
|
||||
task(sender, cancellation_receiver).await
|
||||
let task_result = task(sender, cancellation_receiver).await;
|
||||
events_sender.send(TaskEvent::End(task_result)).ok();
|
||||
});
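The change above makes the spawned task publish its own TaskEvent::End through the watch channel, so listeners never need to join the handle to learn the final result. A reduced, self-contained sketch of that pattern (TaskEvent simplified to two variants):

use tokio::sync::watch;

#[derive(Clone, Debug)]
enum TaskEvent {
    Started,
    End(Result<(), String>),
}

// The spawned future reports its own completion through the watch channel,
// so the listener only ever needs `events_receiver.changed()`.
async fn spawn_reporting_task() -> watch::Receiver<TaskEvent> {
    let (events_sender, events_receiver) = watch::channel(TaskEvent::Started);
    tokio::spawn(async move {
        let result: Result<(), String> = Ok(()); // the real task body goes here
        events_sender.send(TaskEvent::End(result)).ok();
    });
    events_receiver
}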
|
||||
|
||||
TaskHandle {
|
||||
@@ -215,16 +216,6 @@ impl<E: Clone> TaskHandle<E> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn next_task_event(&mut self) -> TaskEvent<E> {
|
||||
select! {
|
||||
next_task_event = self.events_receiver.changed() => match next_task_event {
|
||||
Ok(()) => self.events_receiver.borrow().clone(),
|
||||
Err(_task_channel_part_dropped) => join_on_handle(&mut self.handle).await,
|
||||
},
|
||||
task_completion_result = join_on_handle(&mut self.handle) => task_completion_result,
|
||||
}
|
||||
}
|
||||
|
||||
/// Aborts current task, waiting for it to finish.
|
||||
async fn shutdown(self) {
|
||||
self.cancellation.send(()).ok();
|
||||
@@ -234,19 +225,6 @@ impl<E: Clone> TaskHandle<E> {
|
||||
}
|
||||
}
|
||||
|
||||
async fn join_on_handle<E>(handle: &mut JoinHandle<Result<(), String>>) -> TaskEvent<E> {
|
||||
match handle.await {
|
||||
Ok(task_result) => TaskEvent::End(task_result),
|
||||
Err(e) => {
|
||||
if e.is_cancelled() {
|
||||
TaskEvent::End(Ok(()))
|
||||
} else {
|
||||
TaskEvent::End(Err(format!("WAL receiver task panicked: {e}")))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A step to process timeline attach/detach events to enable/disable the corresponding WAL receiver machinery.
|
||||
/// In addition to WAL streaming management, the step ensures that the corresponding tenant has its service threads enabled or disabled.
|
||||
/// This is done here, since only walreceiver knows when a certain tenant has no streaming enabled.
|
||||
@@ -264,10 +242,7 @@ async fn wal_receiver_main_thread_loop_step<'a>(
|
||||
info!("Processing timeline update: {update:?}");
|
||||
match update {
|
||||
// Timeline got detached, stop all related tasks and remove public timeline data.
|
||||
LocalTimelineUpdate::Detach {
|
||||
id,
|
||||
join_confirmation_sender,
|
||||
} => {
|
||||
LocalTimelineUpdate::Detach(id) => {
|
||||
match local_timeline_wal_receivers.get_mut(&id.tenant_id) {
|
||||
Some(wal_receivers) => {
|
||||
if let hash_map::Entry::Occupied(o) = wal_receivers.entry(id.timeline_id) {
|
||||
@@ -283,48 +258,44 @@ async fn wal_receiver_main_thread_loop_step<'a>(
|
||||
};
|
||||
{
|
||||
WAL_RECEIVER_ENTRIES.write().await.remove(&id);
|
||||
if let Err(e) = join_confirmation_sender.send(()) {
|
||||
warn!("cannot send wal_receiver shutdown confirmation {e}")
|
||||
} else {
|
||||
info!("confirm walreceiver shutdown for {id}");
|
||||
}
|
||||
}
|
||||
}
|
||||
// Timeline got attached, retrieve all necessary information to start its broker loop and maintain this loop endlessly.
|
||||
LocalTimelineUpdate::Attach { id, datadir } => {
|
||||
LocalTimelineUpdate::Attach(new_id, new_timeline) => {
|
||||
let timeline_connection_managers = local_timeline_wal_receivers
|
||||
.entry(id.tenant_id)
|
||||
.entry(new_id.tenant_id)
|
||||
.or_default();
|
||||
|
||||
if timeline_connection_managers.is_empty() {
|
||||
if let Err(e) = change_tenant_state(id.tenant_id, TenantState::Active).await
|
||||
if let Err(e) =
|
||||
change_tenant_state(new_id.tenant_id, TenantState::Active).await
|
||||
{
|
||||
error!("Failed to make tenant active for id {id}: {e:#}");
|
||||
error!("Failed to make tenant active for id {new_id}: {e:#}");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
let vacant_connection_manager_entry =
|
||||
match timeline_connection_managers.entry(id.timeline_id) {
|
||||
match timeline_connection_managers.entry(new_id.timeline_id) {
|
||||
hash_map::Entry::Occupied(_) => {
|
||||
debug!("Attepted to readd an existing timeline {id}, ignoring");
|
||||
debug!("Attepted to readd an existing timeline {new_id}, ignoring");
|
||||
return;
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => v,
|
||||
};
|
||||
|
||||
let (wal_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag) =
|
||||
match fetch_tenant_settings(id.tenant_id).await {
|
||||
match fetch_tenant_settings(new_id.tenant_id).await {
|
||||
Ok(settings) => settings,
|
||||
Err(e) => {
|
||||
error!("Failed to fetch tenant settings for id {id}: {e:#}");
|
||||
error!("Failed to fetch tenant settings for id {new_id}: {e:#}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
{
|
||||
WAL_RECEIVER_ENTRIES.write().await.insert(
|
||||
id,
|
||||
new_id,
|
||||
WalReceiverEntry {
|
||||
wal_producer_connstr: None,
|
||||
last_received_msg_lsn: None,
|
||||
@@ -335,10 +306,10 @@ async fn wal_receiver_main_thread_loop_step<'a>(
|
||||
|
||||
vacant_connection_manager_entry.insert(
|
||||
connection_manager::spawn_connection_manager_task(
|
||||
id,
|
||||
new_id,
|
||||
broker_prefix.to_owned(),
|
||||
etcd_client.clone(),
|
||||
datadir,
|
||||
new_timeline,
|
||||
wal_connect_timeout,
|
||||
lagging_wal_timeout,
|
||||
max_lsn_wal_lag,
|
||||
|
||||
@@ -104,29 +104,49 @@ async fn connection_manager_loop_step(
|
||||
|
||||
Some(wal_connection_update) = async {
|
||||
match walreceiver_state.wal_connection.as_mut() {
|
||||
Some(wal_connection) => Some(wal_connection.connection_task.next_task_event().await),
|
||||
Some(wal_connection) => {
|
||||
let receiver = &mut wal_connection.connection_task.events_receiver;
|
||||
Some(match receiver.changed().await {
|
||||
Ok(()) => receiver.borrow().clone(),
|
||||
Err(_cancellation_error) => TaskEvent::End(Ok(())),
|
||||
})
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
} => {
|
||||
let wal_connection = walreceiver_state.wal_connection.as_mut().expect("Should have a connection, as checked by the corresponding select! guard");
|
||||
match &wal_connection_update {
|
||||
TaskEvent::Started => {
|
||||
wal_connection.latest_connection_update = Utc::now().naive_utc();
|
||||
*walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0) += 1;
|
||||
},
|
||||
TaskEvent::NewEvent(replication_feedback) => {
|
||||
wal_connection.latest_connection_update = DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc();
|
||||
// reset connection attempts here only, the only place where both nodes
|
||||
// explicitly confirmn with replication feedback that they are connected to each other
|
||||
walreceiver_state.wal_connection_attempts.remove(&wal_connection.sk_id);
|
||||
},
|
||||
let (connection_update, reset_connection_attempts) = match &wal_connection_update {
|
||||
TaskEvent::Started => (Some(Utc::now().naive_utc()), true),
|
||||
TaskEvent::NewEvent(replication_feedback) => (Some(DateTime::<Local>::from(replication_feedback.ps_replytime).naive_utc()), true),
|
||||
TaskEvent::End(end_result) => {
|
||||
match end_result {
|
||||
Ok(()) => debug!("WAL receiving task finished"),
|
||||
Err(e) => warn!("WAL receiving task failed: {e}"),
|
||||
let should_reset_connection_attempts = match end_result {
|
||||
Ok(()) => {
|
||||
debug!("WAL receiving task finished");
|
||||
true
|
||||
},
|
||||
Err(e) => {
|
||||
warn!("WAL receiving task failed: {e}");
|
||||
false
|
||||
},
|
||||
};
|
||||
walreceiver_state.wal_connection = None;
|
||||
(None, should_reset_connection_attempts)
|
||||
},
|
||||
};
|
||||
|
||||
if let Some(connection_update) = connection_update {
|
||||
match &mut walreceiver_state.wal_connection {
|
||||
Some(wal_connection) => {
|
||||
wal_connection.latest_connection_update = connection_update;
|
||||
|
||||
let attempts_entry = walreceiver_state.wal_connection_attempts.entry(wal_connection.sk_id).or_insert(0);
|
||||
if reset_connection_attempts {
|
||||
*attempts_entry = 0;
|
||||
} else {
|
||||
*attempts_entry += 1;
|
||||
}
|
||||
},
|
||||
None => error!("Received connection update for WAL connection that is not active, update: {wal_connection_update:?}"),
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -386,8 +406,10 @@ impl WalreceiverState {
|
||||
Some(existing_wal_connection) => {
|
||||
let connected_sk_node = existing_wal_connection.sk_id;
|
||||
|
||||
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) =
|
||||
self.select_connection_candidate(Some(connected_sk_node))?;
|
||||
let (new_sk_id, new_safekeeper_etcd_data, new_wal_producer_connstr) = self
|
||||
.applicable_connection_candidates()
|
||||
.filter(|&(sk_id, _, _)| sk_id != connected_sk_node)
|
||||
.max_by_key(|(_, info, _)| info.commit_lsn)?;
|
||||
|
||||
let now = Utc::now().naive_utc();
|
||||
if let Ok(latest_interaciton) =
|
||||
@@ -440,8 +462,9 @@ impl WalreceiverState {
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let (new_sk_id, _, new_wal_producer_connstr) =
|
||||
self.select_connection_candidate(None)?;
|
||||
let (new_sk_id, _, new_wal_producer_connstr) = self
|
||||
.applicable_connection_candidates()
|
||||
.max_by_key(|(_, info, _)| info.commit_lsn)?;
|
||||
return Some(NewWalConnectionCandidate {
|
||||
safekeeper_id: new_sk_id,
|
||||
wal_producer_connstr: new_wal_producer_connstr,
|
||||
@@ -453,49 +476,6 @@ impl WalreceiverState {
        None
    }

    /// Selects the best possible candidate, based on the data collected from etcd updates about the safekeepers.
    /// Optionally, omits the given node, to support gracefully switching from a healthy safekeeper to another.
    ///
    /// The candidate that is chosen:
    /// * has fewest connection attempts from pageserver to safekeeper node (reset every time the WAL replication feedback is sent)
    /// * has greatest data Lsn among the ones that are left
    ///
    /// NOTE:
    /// We evict timeline data received from etcd based on time passed since it was registered, along with its connection attempts values, but
    /// otherwise to reset the connection attempts, a successful connection to that node is needed.
    /// That won't happen now, before all nodes with less connection attempts are connected to first, which might leave the sk node with more advanced state to be ignored.
    fn select_connection_candidate(
        &self,
        node_to_omit: Option<NodeId>,
    ) -> Option<(NodeId, &SkTimelineInfo, String)> {
        let all_candidates = self
            .applicable_connection_candidates()
            .filter(|&(sk_id, _, _)| Some(sk_id) != node_to_omit)
            .collect::<Vec<_>>();

        let smallest_attempts_allowed = all_candidates
            .iter()
            .map(|(sk_id, _, _)| {
                self.wal_connection_attempts
                    .get(sk_id)
                    .copied()
                    .unwrap_or(0)
            })
            .min()?;

        all_candidates
            .into_iter()
            .filter(|(sk_id, _, _)| {
                smallest_attempts_allowed
                    >= self
                        .wal_connection_attempts
                        .get(sk_id)
                        .copied()
                        .unwrap_or(0)
            })
            .max_by_key(|(_, info, _)| info.commit_lsn)
    }

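Editor's note: the doc comment above spells out the two-step ordering that this removed helper implemented — fewest recorded connection attempts first, then the highest commit LSN among those. Purely as a reading aid, and using hypothetical standalone types instead of the pageserver's real NodeId / SkTimelineInfo / attempts map, a minimal sketch of that ordering could look like this:

    use std::collections::HashMap;

    #[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
    struct NodeId(u64);

    #[derive(Clone, Copy, Debug)]
    struct Candidate {
        node: NodeId,
        commit_lsn: u64,
    }

    // Fewest recorded attempts wins; ties are broken by the highest commit LSN.
    fn pick_candidate(
        candidates: &[Candidate],
        attempts: &HashMap<NodeId, u32>,
        node_to_omit: Option<NodeId>,
    ) -> Option<Candidate> {
        let remaining: Vec<Candidate> = candidates
            .iter()
            .copied()
            .filter(|c| Some(c.node) != node_to_omit)
            .collect();

        let fewest_attempts = remaining
            .iter()
            .map(|c| attempts.get(&c.node).copied().unwrap_or(0))
            .min()?;

        remaining
            .into_iter()
            .filter(|c| attempts.get(&c.node).copied().unwrap_or(0) == fewest_attempts)
            .max_by_key(|c| c.commit_lsn)
    }

    fn main() {
        let candidates = [
            Candidate { node: NodeId(0), commit_lsn: 100_100 },
            Candidate { node: NodeId(1), commit_lsn: 100_000 },
        ];
        let attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]);
        // NodeId(1) wins despite the lower LSN because it has fewer failed attempts.
        assert_eq!(
            pick_candidate(&candidates, &attempts, None).unwrap().node,
            NodeId(1)
        );
    }

This mirrors the candidate_with_many_connection_failures test further down in the diff, where the node with fewer connection errors is preferred over the one with the larger commit LSN.
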
    fn applicable_connection_candidates(
        &self,
    ) -> impl Iterator<Item = (NodeId, &SkTimelineInfo, String)> {
@@ -520,25 +500,15 @@ impl WalreceiverState {
|
||||
}
|
||||
|
||||
fn cleanup_old_candidates(&mut self) {
|
||||
let mut node_ids_to_remove = Vec::with_capacity(self.wal_stream_candidates.len());
|
||||
|
||||
self.wal_stream_candidates.retain(|node_id, etcd_info| {
|
||||
self.wal_stream_candidates.retain(|_, etcd_info| {
|
||||
if let Ok(time_since_latest_etcd_update) =
|
||||
(Utc::now().naive_utc() - etcd_info.latest_update).to_std()
|
||||
{
|
||||
let should_retain = time_since_latest_etcd_update < self.lagging_wal_timeout;
|
||||
if !should_retain {
|
||||
node_ids_to_remove.push(*node_id);
|
||||
}
|
||||
should_retain
|
||||
time_since_latest_etcd_update < self.lagging_wal_timeout
|
||||
} else {
|
||||
true
|
||||
}
|
||||
});
|
||||
|
||||
for node_id in node_ids_to_remove {
|
||||
self.wal_connection_attempts.remove(&node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -873,64 +843,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn candidate_with_many_connection_failures() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("candidate_with_many_connection_failures")?;
|
||||
let mut state = dummy_state(&harness);
|
||||
let now = Utc::now().naive_utc();
|
||||
|
||||
let current_lsn = Lsn(100_000).align();
|
||||
let bigger_lsn = Lsn(current_lsn.0 + 100).align();
|
||||
|
||||
state.wal_connection = None;
|
||||
state.wal_stream_candidates = HashMap::from([
|
||||
(
|
||||
NodeId(0),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(bigger_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
(
|
||||
NodeId(1),
|
||||
EtcdSkTimeline {
|
||||
timeline: SkTimelineInfo {
|
||||
last_log_term: None,
|
||||
flush_lsn: None,
|
||||
commit_lsn: Some(current_lsn),
|
||||
backup_lsn: None,
|
||||
remote_consistent_lsn: None,
|
||||
peer_horizon_lsn: None,
|
||||
safekeeper_connstr: Some(DUMMY_SAFEKEEPER_CONNSTR.to_string()),
|
||||
},
|
||||
etcd_version: 0,
|
||||
latest_update: now,
|
||||
},
|
||||
),
|
||||
]);
|
||||
state.wal_connection_attempts = HashMap::from([(NodeId(0), 1), (NodeId(1), 0)]);
|
||||
|
||||
let candidate_with_less_errors = state
|
||||
.next_connection_candidate()
|
||||
.expect("Expected one candidate selected, but got none");
|
||||
assert_eq!(
|
||||
candidate_with_less_errors.safekeeper_id,
|
||||
NodeId(1),
|
||||
"Should select the node with less connection errors"
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn connection_no_etcd_data_candidate() -> anyhow::Result<()> {
|
||||
let harness = RepoHarness::create("connection_no_etcd_data_candidate")?;
|
||||
|
||||
@@ -623,7 +623,6 @@ impl PostgresRedoProcess {
            .env_clear()
            .env("LD_LIBRARY_PATH", conf.pg_lib_dir())
            .env("DYLD_LIBRARY_PATH", conf.pg_lib_dir())
            .close_fds()
            .output()
            .map_err(|e| Error::new(e.kind(), format!("failed to execute initdb: {}", e)))?;

@@ -49,12 +49,6 @@ impl UserFacingError for ConsoleAuthError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<&auth::credentials::ClientCredsParseError> for ConsoleAuthError {
|
||||
fn from(e: &auth::credentials::ClientCredsParseError) -> Self {
|
||||
ConsoleAuthError::BadProjectName(e.clone())
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: convert into an enum with "error"
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
struct GetRoleSecretResponse {
|
||||
@@ -98,9 +92,14 @@ impl<'a> Api<'a> {
|
||||
|
||||
async fn get_auth_info(&self) -> Result<AuthInfo> {
|
||||
let mut url = self.endpoint.clone();
|
||||
let project_name = self
|
||||
.creds
|
||||
.project_name
|
||||
.as_ref()
|
||||
.map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?;
|
||||
url.path_segments_mut().push("proxy_get_role_secret");
|
||||
url.query_pairs_mut()
|
||||
.append_pair("project", self.creds.project_name.as_ref()?)
|
||||
.append_pair("project", project_name)
|
||||
.append_pair("role", &self.creds.user);
|
||||
|
||||
// TODO: use a proper logger
|
||||
@@ -122,8 +121,12 @@ impl<'a> Api<'a> {
|
||||
/// Wake up the compute node and return the corresponding connection info.
|
||||
async fn wake_compute(&self) -> Result<DatabaseInfo> {
|
||||
let mut url = self.endpoint.clone();
|
||||
let project_name = self
|
||||
.creds
|
||||
.project_name
|
||||
.as_ref()
|
||||
.map_err(|e| ConsoleAuthError::BadProjectName(e.clone()))?;
|
||||
url.path_segments_mut().push("proxy_wake_compute");
|
||||
let project_name = self.creds.project_name.as_ref()?;
|
||||
url.query_pairs_mut().append_pair("project", project_name);
|
||||
|
||||
// TODO: use a proper logger
|
||||
|
||||
@@ -115,7 +115,7 @@ mod tests {
|
||||
Ok(())
|
||||
});
|
||||
|
||||
waiter.await?;
|
||||
let () = waiter.await?;
|
||||
notifier.await?
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,11 +5,6 @@ use anyhow::Context;
use anyhow::Error;
use anyhow::Result;
use etcd_broker::subscription_value::SkTimelineInfo;
use etcd_broker::LeaseKeepAliveStream;
use etcd_broker::LeaseKeeper;

use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::time::Duration;
use tokio::spawn;
use tokio::task::JoinHandle;
@@ -26,7 +21,7 @@ use utils::zid::{NodeId, ZTenantTimelineId};

const RETRY_INTERVAL_MSEC: u64 = 1000;
const PUSH_INTERVAL_MSEC: u64 = 1000;
const LEASE_TTL_SEC: i64 = 10;
const LEASE_TTL_SEC: i64 = 5;

pub fn thread_main(conf: SafeKeeperConf) {
    let runtime = runtime::Builder::new_current_thread()
@@ -159,48 +154,13 @@ pub fn get_candiate_name(system_id: NodeId) -> String {
|
||||
format!("id_{system_id}")
|
||||
}
|
||||
|
||||
async fn push_sk_info(
|
||||
zttid: ZTenantTimelineId,
|
||||
mut client: Client,
|
||||
key: String,
|
||||
sk_info: SkTimelineInfo,
|
||||
mut lease: Lease,
|
||||
) -> anyhow::Result<(ZTenantTimelineId, Lease)> {
|
||||
let put_opts = PutOptions::new().with_lease(lease.id);
|
||||
client
|
||||
.put(
|
||||
key.clone(),
|
||||
serde_json::to_string(&sk_info)?,
|
||||
Some(put_opts),
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("failed to push safekeeper info to {}", key))?;
|
||||
|
||||
// revive the lease
|
||||
lease
|
||||
.keeper
|
||||
.keep_alive()
|
||||
.await
|
||||
.context("failed to send LeaseKeepAliveRequest")?;
|
||||
lease
|
||||
.ka_stream
|
||||
.message()
|
||||
.await
|
||||
.context("failed to receive LeaseKeepAliveResponse")?;
|
||||
|
||||
Ok((zttid, lease))
|
||||
}
|
||||
|
||||
struct Lease {
|
||||
id: i64,
|
||||
keeper: LeaseKeeper,
|
||||
ka_stream: LeaseKeepAliveStream,
|
||||
}
|
||||
|
||||
/// Push once in a while data about all active timelines to the broker.
|
||||
async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
let mut client = Client::connect(&conf.broker_endpoints, None).await?;
|
||||
let mut leases: HashMap<ZTenantTimelineId, Lease> = HashMap::new();
|
||||
|
||||
// Get and maintain lease to automatically delete obsolete data
|
||||
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
|
||||
let (mut keeper, mut ka_stream) = client.lease_keep_alive(lease.id()).await?;
|
||||
|
||||
let push_interval = Duration::from_millis(PUSH_INTERVAL_MSEC);
|
||||
loop {
|
||||
@@ -208,46 +168,33 @@ async fn push_loop(conf: SafeKeeperConf) -> anyhow::Result<()> {
|
||||
// is under plain mutex. That's ok, all this code is not performance
|
||||
// sensitive and there is no risk of deadlock as we don't await while
|
||||
// lock is held.
|
||||
let active_tlis = GlobalTimelines::get_active_timelines();
|
||||
|
||||
// // Get and maintain (if not yet) per timeline lease to automatically delete obsolete data.
|
||||
for zttid in active_tlis.iter() {
|
||||
if let Entry::Vacant(v) = leases.entry(*zttid) {
|
||||
let lease = client.lease_grant(LEASE_TTL_SEC, None).await?;
|
||||
let (keeper, ka_stream) = client.lease_keep_alive(lease.id()).await?;
|
||||
v.insert(Lease {
|
||||
id: lease.id(),
|
||||
keeper,
|
||||
ka_stream,
|
||||
});
|
||||
for zttid in GlobalTimelines::get_active_timelines() {
|
||||
if let Some(tli) = GlobalTimelines::get_loaded(zttid) {
|
||||
let sk_info = tli.get_public_info(&conf)?;
|
||||
let put_opts = PutOptions::new().with_lease(lease.id());
|
||||
client
|
||||
.put(
|
||||
timeline_safekeeper_path(
|
||||
conf.broker_etcd_prefix.clone(),
|
||||
zttid,
|
||||
conf.my_id,
|
||||
),
|
||||
serde_json::to_string(&sk_info)?,
|
||||
Some(put_opts),
|
||||
)
|
||||
.await
|
||||
.context("failed to push safekeeper info")?;
|
||||
}
|
||||
}
|
||||
leases.retain(|zttid, _| active_tlis.contains(zttid));
|
||||
|
||||
// Push data concurrently to not suffer from latency, with many timelines it can be slow.
|
||||
let handles = active_tlis
|
||||
.iter()
|
||||
.filter_map(|zttid| GlobalTimelines::get_loaded(*zttid))
|
||||
.map(|tli| {
|
||||
let sk_info = tli.get_public_info(&conf);
|
||||
let key = timeline_safekeeper_path(
|
||||
conf.broker_etcd_prefix.clone(),
|
||||
tli.zttid,
|
||||
conf.my_id,
|
||||
);
|
||||
let lease = leases.remove(&tli.zttid).unwrap();
|
||||
tokio::spawn(push_sk_info(tli.zttid, client.clone(), key, sk_info, lease))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
for h in handles {
|
||||
let (zttid, lease) = h.await??;
|
||||
// It is ugly to pull leases from hash and then put it back, but
|
||||
// otherwise we have to resort to long living per tli tasks (which
|
||||
// would generate a lot of errors when etcd is down) as task wants to
|
||||
// have 'static objects, we can't borrow to it.
|
||||
leases.insert(zttid, lease);
|
||||
}
|
||||
|
||||
// revive the lease
|
||||
keeper
|
||||
.keep_alive()
|
||||
.await
|
||||
.context("failed to send LeaseKeepAliveRequest")?;
|
||||
ka_stream
|
||||
.message()
|
||||
.await
|
||||
.context("failed to receive LeaseKeepAliveResponse")?;
|
||||
sleep(push_interval).await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -239,19 +239,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
            remote_consistent_lsn: Lsn(0),
            peers: Peers(vec![]),
        });
    } else if version == 5 {
        info!("reading safekeeper control file version {}", version);
        let mut oldstate = SafeKeeperState::des(&buf[..buf.len()])?;
        if oldstate.timeline_start_lsn != Lsn(0) {
            return Ok(oldstate);
        }

        // set special timeline_start_lsn because we don't know the real one
        info!("setting timeline_start_lsn and local_start_lsn to Lsn(1)");
        oldstate.timeline_start_lsn = Lsn(1);
        oldstate.local_start_lsn = Lsn(1);

        return Ok(oldstate);
    }
    bail!("unsupported safekeeper control file version {}", version)
}

@@ -28,7 +28,7 @@ use utils::{
};

pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 6;
pub const SK_FORMAT_VERSION: u32 = 5;
const SK_PROTOCOL_VERSION: u32 = 2;
const UNKNOWN_SERVER_VERSION: u32 = 0;

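Editor's note: the hunks above adjust how an older safekeeper control file is upgraded on load — a version-5 file that never recorded timeline_start_lsn is given the sentinel Lsn(1) before being returned, and the current format version constant moves between 5 and 6. As an illustration only, with toy types rather than the real SafeKeeperState, the general "deserialize old version, default the fields it could not know, return the upgraded state" pattern looks roughly like this:

    // Illustrative only: toy types, not the safekeeper's actual control file structs.
    #[derive(Debug, Clone, Copy, PartialEq)]
    struct Lsn(u64);

    #[derive(Debug)]
    struct ControlFile {
        format_version: u32,
        timeline_start_lsn: Lsn,
    }

    fn upgrade(mut old: ControlFile, current_version: u32) -> Result<ControlFile, String> {
        if old.format_version > current_version {
            return Err(format!(
                "unsupported control file version {}",
                old.format_version
            ));
        }
        // Older formats did not persist timeline_start_lsn; use a sentinel value
        // (the hunk above uses Lsn(1)) so later code can tell it was never recorded.
        if old.timeline_start_lsn == Lsn(0) {
            old.timeline_start_lsn = Lsn(1);
        }
        old.format_version = current_version;
        Ok(old)
    }

    fn main() {
        let upgraded = upgrade(
            ControlFile { format_version: 5, timeline_start_lsn: Lsn(0) },
            5,
        )
        .unwrap();
        assert_eq!(upgraded.timeline_start_lsn, Lsn(1));
    }
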
@@ -11,7 +11,7 @@ use serde::Serialize;
|
||||
use tokio::sync::watch;
|
||||
|
||||
use std::cmp::{max, min};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::fs::{self};
|
||||
|
||||
use std::sync::{Arc, Mutex, MutexGuard};
|
||||
@@ -445,9 +445,9 @@ impl Timeline {
|
||||
}
|
||||
|
||||
/// Prepare public safekeeper info for reporting.
|
||||
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> SkTimelineInfo {
|
||||
pub fn get_public_info(&self, conf: &SafeKeeperConf) -> anyhow::Result<SkTimelineInfo> {
|
||||
let shared_state = self.mutex.lock().unwrap();
|
||||
SkTimelineInfo {
|
||||
Ok(SkTimelineInfo {
|
||||
last_log_term: Some(shared_state.sk.get_epoch()),
|
||||
flush_lsn: Some(shared_state.sk.wal_store.flush_lsn()),
|
||||
// note: this value is not flushed to control file yet and can be lost
|
||||
@@ -460,7 +460,7 @@ impl Timeline {
|
||||
peer_horizon_lsn: Some(shared_state.sk.inmem.peer_horizon_lsn),
|
||||
safekeeper_connstr: Some(conf.listen_pg_addr.clone()),
|
||||
backup_lsn: Some(shared_state.sk.inmem.backup_lsn),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
/// Update timeline state with peer safekeeper data.
|
||||
@@ -625,8 +625,6 @@ impl GlobalTimelines {
|
||||
zttid: ZTenantTimelineId,
|
||||
create: bool,
|
||||
) -> Result<Arc<Timeline>> {
|
||||
let _enter = info_span!("", timeline = %zttid.tenant_id).entered();
|
||||
|
||||
let mut state = TIMELINES_STATE.lock().unwrap();
|
||||
|
||||
match state.timelines.get(&zttid) {
|
||||
@@ -669,7 +667,7 @@ impl GlobalTimelines {
|
||||
}
|
||||
|
||||
/// Get ZTenantTimelineIDs of all active timelines.
|
||||
pub fn get_active_timelines() -> HashSet<ZTenantTimelineId> {
|
||||
pub fn get_active_timelines() -> Vec<ZTenantTimelineId> {
|
||||
let state = TIMELINES_STATE.lock().unwrap();
|
||||
state
|
||||
.timelines
|
||||
|
||||
@@ -2,16 +2,18 @@ use anyhow::{Context, Result};
|
||||
use etcd_broker::subscription_key::{
|
||||
NodeKind, OperationKind, SkOperationKind, SubscriptionKey, SubscriptionKind,
|
||||
};
|
||||
use tokio::io::AsyncRead;
|
||||
use tokio::task::JoinHandle;
|
||||
|
||||
use std::cmp::min;
|
||||
use std::collections::HashMap;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::pin::Pin;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use postgres_ffi::xlog_utils::{XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, PG_TLI};
|
||||
use postgres_ffi::xlog_utils::{
|
||||
XLogFileName, XLogSegNo, XLogSegNoOffsetToRecPtr, MAX_SEND_SIZE, PG_TLI,
|
||||
};
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorage};
|
||||
use tokio::fs::File;
|
||||
use tokio::runtime::Builder;
|
||||
@@ -450,41 +452,45 @@ async fn backup_object(source_file: &Path, size: usize) -> Result<()> {
|
||||
pub async fn read_object(
|
||||
file_path: PathBuf,
|
||||
offset: u64,
|
||||
) -> anyhow::Result<Pin<Box<dyn tokio::io::AsyncRead>>> {
|
||||
let download = match REMOTE_STORAGE
|
||||
.get()
|
||||
.context("Failed to get remote storage")?
|
||||
.as_ref()
|
||||
.context("No remote storage configured")?
|
||||
{
|
||||
GenericRemoteStorage::Local(local_storage) => {
|
||||
let source = local_storage.remote_object_id(&file_path)?;
|
||||
) -> (impl AsyncRead, JoinHandle<Result<()>>) {
|
||||
let storage = REMOTE_STORAGE.get().expect("failed to get remote storage");
|
||||
|
||||
info!(
|
||||
"local download about to start from {} at offset {}",
|
||||
source.display(),
|
||||
offset
|
||||
);
|
||||
local_storage
|
||||
.download_byte_range(&source, offset, None)
|
||||
.await
|
||||
let (mut pipe_writer, pipe_reader) = tokio::io::duplex(MAX_SEND_SIZE);
|
||||
|
||||
let copy_result = tokio::spawn(async move {
|
||||
let res = match storage.as_ref().unwrap() {
|
||||
GenericRemoteStorage::Local(local_storage) => {
|
||||
let source = local_storage.remote_object_id(&file_path)?;
|
||||
|
||||
info!(
|
||||
"local download about to start from {} at offset {}",
|
||||
source.display(),
|
||||
offset
|
||||
);
|
||||
local_storage
|
||||
.download_byte_range(&source, offset, None, &mut pipe_writer)
|
||||
.await
|
||||
}
|
||||
GenericRemoteStorage::S3(s3_storage) => {
|
||||
let s3key = s3_storage.remote_object_id(&file_path)?;
|
||||
|
||||
info!(
|
||||
"S3 download about to start from {:?} at offset {}",
|
||||
s3key, offset
|
||||
);
|
||||
s3_storage
|
||||
.download_byte_range(&s3key, offset, None, &mut pipe_writer)
|
||||
.await
|
||||
}
|
||||
};
|
||||
|
||||
if let Err(e) = res {
|
||||
error!("failed to download WAL segment from remote storage: {}", e);
|
||||
Err(e)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
GenericRemoteStorage::S3(s3_storage) => {
|
||||
let s3key = s3_storage.remote_object_id(&file_path)?;
|
||||
});
|
||||
|
||||
info!(
|
||||
"S3 download about to start from {:?} at offset {}",
|
||||
s3key, offset
|
||||
);
|
||||
s3_storage.download_byte_range(&s3key, offset, None).await
|
||||
}
|
||||
}
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to open WAL segment download stream for local storage path {}",
|
||||
file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(download.download_stream)
|
||||
(pipe_reader, copy_result)
|
||||
}
|
||||
|
||||
@@ -604,7 +604,8 @@ impl WalReader {
|
||||
|
||||
// Try to open remote file, if remote reads are enabled
|
||||
if self.enable_remote_read {
|
||||
return read_object(wal_file_path, xlogoff as u64).await;
|
||||
let (reader, _) = read_object(wal_file_path, xlogoff as u64).await;
|
||||
return Ok(Box::pin(reader));
|
||||
}
|
||||
|
||||
bail!("WAL segment is not found")
|
||||
|
||||
@@ -28,10 +28,6 @@ strict = true
# There is some work in progress, though: https://github.com/MagicStack/asyncpg/pull/577
ignore_missing_imports = true

[mypy-pg8000.*]
# Used only in testing clients
ignore_missing_imports = true

[mypy-cached_property.*]
ignore_missing_imports = true

@@ -45,7 +45,7 @@ If you want to run all tests that have the string "bench" in their names:

Useful environment variables:

`NEON_BIN`: The directory where neon binaries can be found.
`ZENITH_BIN`: The directory where zenith binaries can be found.
`POSTGRES_DISTRIB_DIR`: The directory where postgres distribution can be found.
`TEST_OUTPUT`: Set the directory where test state and test output files
should go.

@@ -1,3 +1,6 @@
from contextlib import closing

import psycopg2.extras
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, NeonPageserverApiException
@@ -105,3 +108,16 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder):

    branch2_cur.execute('SELECT count(*) FROM foo')
    assert branch2_cur.fetchone() == (300000, )


def test_ancestor_branch_detach(neon_simple_env: NeonEnv):
    env = neon_simple_env

    parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_detach_parent", "empty")

    env.neon_cli.create_branch("test_ancestor_branch_detach_branch1",
                               "test_ancestor_branch_detach_parent")

    ps_http = env.pageserver.http_client()
    with pytest.raises(NeonPageserverApiException, match="Failed to detach inmem tenant timeline"):
        ps_http.timeline_detach(env.initial_tenant, parent_timeline_id)

@@ -1,6 +1,8 @@
|
||||
from contextlib import closing
|
||||
from uuid import uuid4
|
||||
from typing import Iterator
|
||||
from uuid import UUID, uuid4
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserverApiException
|
||||
from requests.exceptions import HTTPError
|
||||
import pytest
|
||||
|
||||
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
from contextlib import closing, contextmanager
|
||||
import psycopg2.extras
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder
|
||||
from fixtures.neon_fixtures import PgProtocol, NeonEnvBuilder
|
||||
from fixtures.log_helper import log
|
||||
import os
|
||||
import time
|
||||
import asyncpg
|
||||
from fixtures.neon_fixtures import Postgres
|
||||
import threading
|
||||
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import pytest
|
||||
from contextlib import closing
|
||||
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
#
|
||||
|
||||
@@ -1,101 +0,0 @@
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import lsn_from_hex


# Test the GC implementation when running with branching.
# This test reproduces the issue https://github.com/neondatabase/neon/issues/707.
#
# Consider two LSNs `lsn1` and `lsn2` with some delta files as follows:
# ...
# p -> has an image layer xx_p with p < lsn1
# ...
# lsn1
# ...
# q -> has an image layer yy_q with lsn1 < q < lsn2
# ...
# lsn2
#
# Consider running a GC iteration such that the GC horizon is between p and lsn1
# ...
# p -> has an image layer xx_p with p < lsn1
# D_start -> is a delta layer D's start (e.g D = '...-...-D_start-D_end')
# ...
# GC_h -> is a gc horizon such that p < GC_h < lsn1
# ...
# lsn1
# ...
# D_end -> is a delta layer D's end
# ...
# q -> has an image layer yy_q with lsn1 < q < lsn2
# ...
# lsn2
#
# As described in the issue #707, the image layer xx_p will be deleted as
# its range is below the GC horizon and there exists a newer image layer yy_q (q > p).
# However, removing xx_p will corrupt any delta layers that depend on xx_p that
# are not deleted by GC. For example, the delta layer D is corrupted in the
# above example because D depends on the image layer xx_p for value reconstruction.
#
# Because the delta layer D covering lsn1 is corrupted, creating a branch
# starting from lsn1 should return an error as follows:
# could not find data for key ... at LSN ..., for request at LSN ...
def test_branch_and_gc(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
# disable background GC
|
||||
'gc_period': '10 m',
|
||||
'gc_horizon': f'{10 * 1024 ** 3}',
|
||||
|
||||
# small checkpoint distance to create more delta layer files
|
||||
'checkpoint_distance': f'{1024 ** 2}',
|
||||
|
||||
# set the target size to be large to allow the image layer to cover the whole key space
|
||||
'compaction_target_size': f'{1024 ** 3}',
|
||||
|
||||
# tweak the default settings to allow quickly create image layers and L1 layers
|
||||
'compaction_period': '1 s',
|
||||
'compaction_threshold': '2',
|
||||
'image_creation_threshold': '1',
|
||||
|
||||
# set PITR interval to be small, so we can do GC
|
||||
'pitr_interval': '1 s'
|
||||
})
|
||||
|
||||
timeline_main = env.neon_cli.create_timeline(f'test_main', tenant_id=tenant)
|
||||
pg_main = env.postgres.create_start('test_main', tenant_id=tenant)
|
||||
|
||||
main_cur = pg_main.connect().cursor()
|
||||
|
||||
main_cur.execute(
|
||||
"CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')"
|
||||
)
|
||||
main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn1 = main_cur.fetchone()[0]
|
||||
log.info(f'LSN1: {lsn1}')
|
||||
|
||||
main_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
|
||||
main_cur.execute('SELECT pg_current_wal_insert_lsn()')
|
||||
lsn2 = main_cur.fetchone()[0]
|
||||
log.info(f'LSN2: {lsn2}')
|
||||
|
||||
# Set the GC horizon so that lsn1 is inside the horizon, which means
|
||||
# we can create a new branch starting from lsn1.
|
||||
env.pageserver.safe_psql(
|
||||
f'''do_gc {tenant.hex} {timeline_main.hex} {lsn_from_hex(lsn2) - lsn_from_hex(lsn1) + 1024}'''
|
||||
)
|
||||
|
||||
env.neon_cli.create_branch('test_branch',
|
||||
'test_main',
|
||||
tenant_id=tenant,
|
||||
ancestor_start_lsn=lsn1)
|
||||
pg_branch = env.postgres.create_start('test_branch', tenant_id=tenant)
|
||||
|
||||
branch_cur = pg_branch.connect().cursor()
|
||||
branch_cur.execute('INSERT INTO foo SELECT FROM generate_series(1, 100000)')
|
||||
|
||||
branch_cur.execute('SELECT count(*) FROM foo')
|
||||
assert branch_cur.fetchone() == (200000, )
|
||||
@@ -1,3 +1,4 @@
|
||||
import subprocess
|
||||
from contextlib import closing
|
||||
|
||||
import psycopg2.extras
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
from typing import List
|
||||
import threading
|
||||
import pytest
|
||||
from fixtures.neon_fixtures import NeonEnv, PgBin, Postgres
|
||||
import time
|
||||
import random
|
||||
from fixtures.log_helper import log
|
||||
from performance.test_perf_pgbench import get_scales_matrix
|
||||
|
||||
|
||||
# Test branch creation
|
||||
#
|
||||
# This test spawns pgbench in a thread in the background, and creates a branch while
|
||||
# pgbench is running. Then it launches pgbench on the new branch, and creates another branch.
|
||||
# Repeat `n_branches` times.
|
||||
#
|
||||
# If 'ty' == 'cascade', each branch is created from the previous branch, so that you end
|
||||
# up with a branch of a branch of a branch ... of a branch. With 'ty' == 'flat',
|
||||
# each branch is created from the root.
|
||||
@pytest.mark.parametrize("n_branches", [10])
|
||||
@pytest.mark.parametrize("scale", get_scales_matrix(1))
|
||||
@pytest.mark.parametrize("ty", ["cascade", "flat"])
|
||||
def test_branching_with_pgbench(neon_simple_env: NeonEnv,
|
||||
pg_bin: PgBin,
|
||||
n_branches: int,
|
||||
scale: int,
|
||||
ty: str):
|
||||
env = neon_simple_env
|
||||
|
||||
# Use aggressive GC and checkpoint settings, so that we also exercise GC during the test
|
||||
tenant, _ = env.neon_cli.create_tenant(
|
||||
conf={
|
||||
'gc_period': '5 s',
|
||||
'gc_horizon': f'{1024 ** 2}',
|
||||
'checkpoint_distance': f'{1024 ** 2}',
|
||||
'compaction_target_size': f'{1024 ** 2}',
|
||||
# set PITR interval to be small, so we can do GC
|
||||
'pitr_interval': '5 s'
|
||||
})
|
||||
|
||||
def run_pgbench(pg: Postgres):
|
||||
connstr = pg.connstr()
|
||||
|
||||
log.info(f"Start a pgbench workload on pg {connstr}")
|
||||
|
||||
pg_bin.run_capture(['pgbench', '-i', f'-s{scale}', connstr])
|
||||
pg_bin.run_capture(['pgbench', '-c10', '-T15', connstr])
|
||||
|
||||
env.neon_cli.create_branch('b0', tenant_id=tenant)
|
||||
pgs: List[Postgres] = []
|
||||
pgs.append(env.postgres.create_start('b0', tenant_id=tenant))
|
||||
|
||||
threads: List[threading.Thread] = []
|
||||
threads.append(threading.Thread(target=run_pgbench, args=(pgs[0], ), daemon=True))
|
||||
threads[-1].start()
|
||||
|
||||
for i in range(n_branches):
|
||||
# random a delay between [0, 5]
|
||||
delay = random.random() * 5
|
||||
time.sleep(delay)
|
||||
log.info(f"Sleep {delay}s")
|
||||
|
||||
if ty == "cascade":
|
||||
env.neon_cli.create_branch('b{}'.format(i + 1), 'b{}'.format(i), tenant_id=tenant)
|
||||
else:
|
||||
env.neon_cli.create_branch('b{}'.format(i + 1), 'b0', tenant_id=tenant)
|
||||
|
||||
pgs.append(env.postgres.create_start('b{}'.format(i + 1), tenant_id=tenant))
|
||||
|
||||
threads.append(threading.Thread(target=run_pgbench, args=(pgs[-1], ), daemon=True))
|
||||
threads[-1].start()
|
||||
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
for pg in pgs:
|
||||
res = pg.safe_psql('SELECT count(*) from pgbench_accounts')
|
||||
assert res[0] == (100000 * scale, )
|
||||
@@ -110,6 +110,6 @@ def test_fix_broken_timelines_on_startup(neon_simple_env: NeonEnv):
    env.neon_cli.pageserver_stop(immediate=True)
    env.neon_cli.pageserver_start()

    # Check that tenant with "broken" timeline is not loaded.
    with pytest.raises(Exception, match=f"Failed to get repo for tenant {tenant_id.hex}"):
        env.neon_cli.list_timelines(tenant_id)
    # Check that the "broken" timeline is not loaded
    timelines = env.neon_cli.list_timelines(tenant_id)
    assert len(timelines) == 1

@@ -1,51 +0,0 @@
|
||||
from contextlib import closing
|
||||
import shutil
|
||||
import time
|
||||
import subprocess
|
||||
import os.path
|
||||
|
||||
from cached_property import threading
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
from fixtures.log_helper import log
|
||||
|
||||
|
||||
def lsof_path() -> str:
|
||||
path_output = shutil.which("lsof")
|
||||
if path_output is None:
|
||||
raise RuntimeError('lsof not found in PATH')
|
||||
else:
|
||||
return path_output
|
||||
|
||||
|
||||
# Makes sure that `pageserver.pid` is only held by `pageserve` command, not other commands.
|
||||
# This is to test the changes in https://github.com/neondatabase/neon/pull/1834.
|
||||
def test_lsof_pageserver_pid(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
|
||||
def start_workload():
|
||||
env.neon_cli.create_branch("test_lsof_pageserver_pid")
|
||||
pg = env.postgres.create_start("test_lsof_pageserver_pid")
|
||||
with closing(pg.connect()) as conn:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("CREATE TABLE foo as SELECT x FROM generate_series(1,100000) x")
|
||||
cur.execute("update foo set x=x+1")
|
||||
|
||||
workload_thread = threading.Thread(target=start_workload, args=(), daemon=True)
|
||||
workload_thread.start()
|
||||
|
||||
path = os.path.join(env.repo_dir, "pageserver.pid")
|
||||
lsof = lsof_path()
|
||||
while workload_thread.is_alive():
|
||||
res = subprocess.run([lsof, path],
|
||||
check=False,
|
||||
universal_newlines=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
|
||||
# parse the `lsof` command's output to get only the list of commands
|
||||
commands = [line.split(' ')[0] for line in res.stdout.strip().split('\n')[1:]]
|
||||
if len(commands) > 0:
|
||||
log.info(f"lsof commands: {commands}")
|
||||
assert commands == ['pageserve']
|
||||
|
||||
time.sleep(1.0)
|
||||
@@ -1,63 +0,0 @@
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft
|
||||
from fixtures.log_helper import log
|
||||
import pytest
|
||||
|
||||
# Restart nodes with WAL end having specially crafted shape, like last record
|
||||
# crossing segment boundary, to test decoding issues.
|
||||
|
||||
|
||||
@pytest.mark.parametrize('wal_type',
|
||||
[
|
||||
'simple',
|
||||
'last_wal_record_xlog_switch',
|
||||
'last_wal_record_xlog_switch_ends_on_page_boundary',
|
||||
'last_wal_record_crossing_segment',
|
||||
'wal_record_crossing_segment_followed_by_small_one',
|
||||
])
|
||||
def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str):
|
||||
neon_env_builder.num_safekeepers = 1
|
||||
env = neon_env_builder.init_start()
|
||||
env.neon_cli.create_branch('test_crafted_wal_end')
|
||||
|
||||
pg = env.postgres.create('test_crafted_wal_end')
|
||||
wal_craft = WalCraft(env)
|
||||
pg.config(wal_craft.postgres_config())
|
||||
pg.start()
|
||||
res = pg.safe_psql_many(queries=[
|
||||
'CREATE TABLE keys(key int primary key)',
|
||||
'INSERT INTO keys SELECT generate_series(1, 100)',
|
||||
'SELECT SUM(key) FROM keys'
|
||||
])
|
||||
assert res[-1][0] == (5050, )
|
||||
|
||||
wal_craft.in_existing(wal_type, pg.connstr())
|
||||
|
||||
log.info("Restarting all safekeepers and pageservers")
|
||||
env.pageserver.stop()
|
||||
env.safekeepers[0].stop()
|
||||
env.safekeepers[0].start()
|
||||
env.pageserver.start()
|
||||
|
||||
log.info("Trying more queries")
|
||||
res = pg.safe_psql_many(queries=[
|
||||
'SELECT SUM(key) FROM keys',
|
||||
'INSERT INTO keys SELECT generate_series(101, 200)',
|
||||
'SELECT SUM(key) FROM keys',
|
||||
])
|
||||
assert res[0][0] == (5050, )
|
||||
assert res[-1][0] == (20100, )
|
||||
|
||||
log.info("Restarting all safekeepers and pageservers (again)")
|
||||
env.pageserver.stop()
|
||||
env.safekeepers[0].stop()
|
||||
env.safekeepers[0].start()
|
||||
env.pageserver.start()
|
||||
|
||||
log.info("Trying more queries (again)")
|
||||
res = pg.safe_psql_many(queries=[
|
||||
'SELECT SUM(key) FROM keys',
|
||||
'INSERT INTO keys SELECT generate_series(201, 300)',
|
||||
'SELECT SUM(key) FROM keys',
|
||||
])
|
||||
assert res[0][0] == (20100, )
|
||||
assert res[-1][0] == (45150, )
|
||||
@@ -1,10 +1,16 @@
|
||||
import subprocess
|
||||
from contextlib import closing
|
||||
|
||||
import psycopg2.extras
|
||||
import pytest
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, PortDistributor, VanillaPostgres
|
||||
from fixtures.neon_fixtures import pg_distrib_dir
|
||||
import os
|
||||
from fixtures.utils import subprocess_capture
|
||||
from fixtures.utils import mkdir_if_needed, subprocess_capture
|
||||
import shutil
|
||||
import getpass
|
||||
import pwd
|
||||
|
||||
num_rows = 1000
|
||||
|
||||
@@ -40,20 +46,19 @@ def test_fullbackup(neon_env_builder: NeonEnvBuilder,
|
||||
psql_env = {'LD_LIBRARY_PATH': os.path.join(str(pg_distrib_dir), 'lib')}
|
||||
|
||||
# Get and unpack fullbackup from pageserver
|
||||
restored_dir_path = env.repo_dir / "restored_datadir"
|
||||
restored_dir_path = os.path.join(env.repo_dir, "restored_datadir")
|
||||
os.mkdir(restored_dir_path, 0o750)
|
||||
query = f"fullbackup {env.initial_tenant.hex} {timeline} {lsn}"
|
||||
cmd = ["psql", "--no-psqlrc", env.pageserver.connstr(), "-c", query]
|
||||
result_basepath = pg_bin.run_capture(cmd, env=psql_env)
|
||||
tar_output_file = result_basepath + ".stdout"
|
||||
subprocess_capture(str(env.repo_dir),
|
||||
["tar", "-xf", tar_output_file, "-C", str(restored_dir_path)])
|
||||
subprocess_capture(str(env.repo_dir), ["tar", "-xf", tar_output_file, "-C", restored_dir_path])
|
||||
|
||||
# HACK
|
||||
# fullbackup returns neon specific pg_control and first WAL segment
|
||||
# use resetwal to overwrite it
|
||||
pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, 'pg_resetwal')
|
||||
cmd = [pg_resetwal_path, "-D", str(restored_dir_path)]
|
||||
cmd = [pg_resetwal_path, "-D", restored_dir_path]
|
||||
pg_bin.run_capture(cmd, env=psql_env)
|
||||
|
||||
# Restore from the backup and find the data we inserted
|
||||
|
||||
@@ -90,7 +90,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
|
||||
# Clean up
|
||||
# TODO it should clean itself
|
||||
client = env.pageserver.http_client()
|
||||
client.timeline_delete(tenant, timeline)
|
||||
client.timeline_detach(tenant, timeline)
|
||||
|
||||
# Importing correct backup works
|
||||
import_tar(base_tar, wal_tar)
|
||||
@@ -191,8 +191,3 @@ def test_import_from_pageserver(test_output_dir, pg_bin, vanilla_pg, neon_env_bu
|
||||
# Check it's the same as the first fullbackup
|
||||
# TODO pageserver should be checking checksum
|
||||
assert os.path.getsize(tar_output_file) == os.path.getsize(new_tar_output_file)
|
||||
|
||||
# Check that gc works
|
||||
psconn = env.pageserver.connect()
|
||||
pscur = psconn.cursor()
|
||||
pscur.execute(f"do_gc {tenant.hex} {timeline} 0")
|
||||
|
||||
@@ -24,7 +24,7 @@ def check_tenant(env: NeonEnv, pageserver_http: NeonPageserverHttpClient):
|
||||
assert res_2[0] == (5000050000, )
|
||||
|
||||
pg.stop()
|
||||
pageserver_http.tenant_detach(tenant_id)
|
||||
pageserver_http.timeline_detach(tenant_id, timeline_id)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('num_timelines,num_safekeepers', [(3, 1)])
|
||||
|
||||
@@ -1,12 +1,12 @@
# It's possible to run any regular test with the local fs remote storage via
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
# env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/zenith_zzz/'}" poetry ......

import shutil, os
from contextlib import closing
from pathlib import Path
import time
from uuid import UUID
from fixtures.neon_fixtures import NeonEnvBuilder, assert_timeline_local, wait_until, wait_for_last_record_lsn, wait_for_upload
from fixtures.neon_fixtures import NeonEnvBuilder, assert_local, wait_until, wait_for_last_record_lsn, wait_for_upload
from fixtures.log_helper import log
from fixtures.utils import lsn_from_hex, lsn_to_hex
import pytest
@@ -91,14 +91,14 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto
    # Introduce failpoint in download
    env.pageserver.safe_psql(f"failpoints remote-storage-download-pre-rename=return")

    client.tenant_attach(UUID(tenant_id))
    client.timeline_attach(UUID(tenant_id), UUID(timeline_id))

    # is there a better way to assert that failpoint triggered?
    # is there a better way to assert that fafilpoint triggered?
    time.sleep(10)

    # assert cannot attach timeline that is scheduled for download
    with pytest.raises(Exception, match="Conflict: Tenant download is already in progress"):
        client.tenant_attach(UUID(tenant_id))
    with pytest.raises(Exception, match="Timeline download is already in progress"):
        client.timeline_attach(UUID(tenant_id), UUID(timeline_id))

    detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id))
    log.info("Timeline detail with active failpoint: %s", detail)
@@ -109,12 +109,12 @@ def test_remote_storage_backup_and_restore(neon_env_builder: NeonEnvBuilder, sto
    env.pageserver.stop()
    env.pageserver.start()

    client.tenant_attach(UUID(tenant_id))
    client.timeline_attach(UUID(tenant_id), UUID(timeline_id))

    log.info("waiting for timeline redownload")
    wait_until(number_of_iterations=10,
               interval=1,
               func=lambda: assert_timeline_local(client, UUID(tenant_id), UUID(timeline_id)))
               func=lambda: assert_local(client, UUID(tenant_id), UUID(timeline_id)))

    detail = client.timeline_detail(UUID(tenant_id), UUID(timeline_id))
    assert detail['local'] is not None
Some files were not shown because too many files have changed in this diff.