mirror of
https://github.com/neondatabase/neon.git
synced 2026-03-06 09:50:38 +00:00
Compare commits
3 Commits
f88fe021-i
...
fuzz-test-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a74ca297e1 | ||
|
|
c8c76a8b99 | ||
|
|
36673ff709 |
2
.circleci/ansible/.gitignore
vendored
2
.circleci/ansible/.gitignore
vendored
@@ -1,4 +1,2 @@
|
||||
zenith_install.tar.gz
|
||||
.zenith_current_version
|
||||
neon_install.tar.gz
|
||||
.neon_current_version
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
- name: Upload Neon binaries
|
||||
- name: Upload Zenith binaries
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
|
||||
tasks:
|
||||
|
||||
- name: get latest version of Neon binaries
|
||||
- name: get latest version of Zenith binaries
|
||||
register: current_version_file
|
||||
set_fact:
|
||||
current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
|
||||
current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
@@ -19,11 +19,11 @@
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: upload and extract Neon binaries to /usr/local
|
||||
- name: upload and extract Zenith binaries to /usr/local
|
||||
ansible.builtin.unarchive:
|
||||
owner: root
|
||||
group: root
|
||||
src: neon_install.tar.gz
|
||||
src: zenith_install.tar.gz
|
||||
dest: /usr/local
|
||||
become: true
|
||||
tags:
|
||||
|
||||
@@ -3,31 +3,50 @@
|
||||
set -e
|
||||
|
||||
RELEASE=${RELEASE:-false}
|
||||
VERSION="1"
|
||||
TAG="backport-f88fe021-import-patch-${VERSION}"
|
||||
|
||||
# look at docker hub for latest tag fo zenith docker image
|
||||
if [ "${RELEASE}" = "true" ]; then
|
||||
echo "search latest relase tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
else
|
||||
TAG="release-${VERSION}"
|
||||
fi
|
||||
else
|
||||
echo "search latest dev tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
else
|
||||
TAG="${VERSION}"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "found ${VERSION}"
|
||||
|
||||
# do initial cleanup
|
||||
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
|
||||
mkdir neon_install
|
||||
rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version
|
||||
mkdir zenith_install
|
||||
|
||||
# retrive binaries from docker image
|
||||
echo "getting binaries from docker image"
|
||||
docker pull --quiet neondatabase/neon:${TAG}
|
||||
ID=$(docker create neondatabase/neon:${TAG})
|
||||
docker pull --quiet zenithdb/zenith:${TAG}
|
||||
ID=$(docker create zenithdb/zenith:${TAG})
|
||||
docker cp ${ID}:/data/postgres_install.tar.gz .
|
||||
tar -xzf postgres_install.tar.gz -C neon_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/
|
||||
tar -xzf postgres_install.tar.gz -C zenith_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/
|
||||
docker rm -vf ${ID}
|
||||
|
||||
# store version to file (for ansible playbooks) and create binaries tarball
|
||||
echo ${VERSION} > neon_install/.neon_current_version
|
||||
echo ${VERSION} > .neon_current_version
|
||||
tar -czf neon_install.tar.gz -C neon_install .
|
||||
echo ${VERSION} > zenith_install/.zenith_current_version
|
||||
echo ${VERSION} > .zenith_current_version
|
||||
tar -czf zenith_install.tar.gz -C zenith_install .
|
||||
|
||||
# do final cleaup
|
||||
rm -rf neon_install postgres_install.tar.gz
|
||||
rm -rf zenith_install postgres_install.tar.gz
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
[pageservers]
|
||||
neon-stress-ps-1 console_region_id=1
|
||||
neon-stress-ps-2 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
neon-stress-sk-1 console_region_id=1
|
||||
neon-stress-sk-2 console_region_id=1
|
||||
neon-stress-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://neon-stress-console.local
|
||||
bucket_name = neon-storage-ireland
|
||||
bucket_region = eu-west-1
|
||||
etcd_endpoints = etcd-stress.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
@@ -1,7 +1,10 @@
|
||||
[pageservers]
|
||||
zenith-1-ps-3 console_region_id=1
|
||||
zenith-1-ps-1 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
zenith-1-sk-1 console_region_id=1
|
||||
zenith-1-sk-2 console_region_id=1
|
||||
zenith-1-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
@@ -11,5 +14,3 @@ safekeepers
|
||||
console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = etcd-release.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -4,9 +4,8 @@ zenith-us-stage-ps-2 console_region_id=27
|
||||
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-1 console_region_id=27
|
||||
zenith-us-stage-sk-2 console_region_id=27
|
||||
zenith-us-stage-sk-4 console_region_id=27
|
||||
zenith-us-stage-sk-5 console_region_id=27
|
||||
zenith-us-stage-sk-6 console_region_id=27
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
@@ -16,5 +15,3 @@ safekeepers
|
||||
console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = etcd-staging.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=pageserver
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
version: 2.1
|
||||
|
||||
executors:
|
||||
neon-xlarge-executor:
|
||||
zenith-xlarge-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
|
||||
- image: zimg/rust:1.58
|
||||
neon-executor:
|
||||
zenith-executor:
|
||||
docker:
|
||||
- image: zimg/rust:1.58
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
@@ -22,7 +22,7 @@ jobs:
|
||||
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -67,9 +67,9 @@ jobs:
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
# A job to build Neon rust code
|
||||
build-neon:
|
||||
executor: neon-xlarge-executor
|
||||
# A job to build zenith rust code
|
||||
build-zenith:
|
||||
executor: zenith-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -113,7 +113,7 @@ jobs:
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS="--release --features profiling"
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
export RUSTC_WRAPPER=cachepot
|
||||
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
||||
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
cachepot -s
|
||||
|
||||
- save_cache:
|
||||
@@ -132,6 +132,20 @@ jobs:
|
||||
- ~/.cargo/git
|
||||
- target
|
||||
|
||||
# Run style checks
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: cargo clippy
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run:
|
||||
name: cargo test
|
||||
@@ -209,7 +223,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
check-codestyle-python:
|
||||
executor: neon-executor
|
||||
executor: zenith-executor
|
||||
steps:
|
||||
- checkout
|
||||
- restore_cache:
|
||||
@@ -222,12 +236,6 @@ jobs:
|
||||
key: v2-python-deps-{{ checksum "poetry.lock" }}
|
||||
paths:
|
||||
- /home/circleci/.cache/pypoetry/virtualenvs
|
||||
- run:
|
||||
name: Print versions
|
||||
when: always
|
||||
command: |
|
||||
poetry run python --version
|
||||
poetry show
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
when: always
|
||||
@@ -238,7 +246,7 @@ jobs:
|
||||
command: poetry run mypy .
|
||||
|
||||
run-pytest:
|
||||
executor: neon-executor
|
||||
executor: zenith-executor
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
@@ -361,7 +369,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -382,7 +390,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -412,7 +420,7 @@ jobs:
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
@@ -429,7 +437,7 @@ jobs:
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build neondatabase/neon:latest image and push it to Docker hub
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -443,18 +451,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:latest
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:latest
|
||||
|
||||
# Build neondatabase/compute-node:latest image and push it to Docker hub
|
||||
# Build zenithdb/compute-node:latest image and push it to Docker hub
|
||||
docker-image-compute:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -462,33 +470,31 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build zenithdb/compute-tools:latest image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:local \
|
||||
--tag neondatabase/compute-tools:latest \
|
||||
-f Dockerfile.compute-tools .
|
||||
# Only push :latest image
|
||||
docker push neondatabase/compute-tools:latest
|
||||
--tag zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:latest
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
|
||||
--tag neondatabase/compute-node:latest vendor/postgres \
|
||||
--build-arg COMPUTE_TOOLS_TAG=local
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:latest
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:latest
|
||||
|
||||
# Build production neondatabase/neon:release image and push it to Docker hub
|
||||
# Build production zenithdb/zenith:release image and push it to Docker hub
|
||||
docker-image-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -502,17 +508,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="backport-f88fe021-import-patch-1"
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:release
|
||||
|
||||
# Build production neondatabase/compute-node:release image and push it to Docker hub
|
||||
# Build production zenithdb/compute-node:release image and push it to Docker hub
|
||||
docker-image-compute-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -520,33 +527,29 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build zenithdb/compute-tools:release image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
# don't build new compute image
|
||||
# docker build \
|
||||
# --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
# --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
# --tag neondatabase/compute-tools:release \
|
||||
# --tag neondatabase/compute-tools:local \
|
||||
# -f Dockerfile.compute-tools .
|
||||
# # Only push :release image
|
||||
# docker push neondatabase/compute-tools:release
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag zenithdb/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:release
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
# don't build new compute image
|
||||
# docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
|
||||
# --tag neondatabase/compute-node:release vendor/postgres \
|
||||
# --build-arg COMPUTE_TOOLS_TAG=local
|
||||
# docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
# docker push neondatabase/compute-node:release
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:release
|
||||
|
||||
deploy-staging:
|
||||
docker:
|
||||
@@ -572,7 +575,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i staging.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-staging-proxy:
|
||||
docker:
|
||||
@@ -590,63 +593,13 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
helm repo add zenithdb https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-neon-stress:
|
||||
docker:
|
||||
- image: cimg/python:3.10
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker
|
||||
- run:
|
||||
name: Setup ansible
|
||||
command: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
- run:
|
||||
name: Redeploy
|
||||
command: |
|
||||
cd "$(pwd)/.circleci/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i neon-stress.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-neon-stress-proxy:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
environment:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-release:
|
||||
docker:
|
||||
@@ -672,7 +625,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i production.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-release-proxy:
|
||||
docker:
|
||||
@@ -681,7 +634,21 @@ jobs:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
# don't deploy proxy
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add zenithdb https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
@@ -737,8 +704,8 @@ workflows:
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-neon:
|
||||
name: build-neon-<< matrix.build_type >>
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
@@ -753,7 +720,7 @@ workflows:
|
||||
test_selection: batch_pg_regress
|
||||
needs_postgres_source: true
|
||||
requires:
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: other-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -761,7 +728,7 @@ workflows:
|
||||
build_type: ["debug", "release"]
|
||||
test_selection: batch_others
|
||||
requires:
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: benchmarks
|
||||
context: PERF_TEST_RESULT_CONNSTR
|
||||
@@ -770,7 +737,7 @@ workflows:
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-neon-release
|
||||
- build-zenith-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
@@ -818,33 +785,14 @@ workflows:
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- deploy-neon-stress:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
- deploy-neon-stress-proxy:
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- docker-image-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# Build image only for commits to f88fe021-import-patch
|
||||
# Build image only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- f88fe021-import-patch
|
||||
- release
|
||||
requires:
|
||||
- pg_regress-tests-release
|
||||
- other-tests-release
|
||||
@@ -862,11 +810,11 @@ workflows:
|
||||
- deploy-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to f88fe021-import-patch
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- f88fe021-import-patch
|
||||
- release
|
||||
requires:
|
||||
- docker-image-release
|
||||
- deploy-release-proxy:
|
||||
@@ -885,6 +833,6 @@ workflows:
|
||||
# XXX: Successful build doesn't mean everything is OK, but
|
||||
# the job to be triggered takes so much time to complete (~22 min)
|
||||
# that it's better not to wait for the commented-out steps
|
||||
- build-neon-release
|
||||
- build-zenith-release
|
||||
# - pg_regress-tests-release
|
||||
# - other-tests-release
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
fullnameOverride: "neon-stress-proxy-scram"
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://neon-stress-console.local/management/api/v2"
|
||||
domain: "*.stress.neon.tech"
|
||||
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: staging
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: ireland
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,34 +0,0 @@
|
||||
fullnameOverride: "neon-stress-proxy"
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.dev.neon.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy
|
||||
zenith_env: staging
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: ireland
|
||||
|
||||
service:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
|
||||
external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
|
||||
type: LoadBalancer
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,24 +0,0 @@
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.cloud.neon.tech"
|
||||
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: production
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: oregon
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,6 +1,9 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.neon.tech/psql_session/"
|
||||
authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.zenith.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -22,7 +25,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: staging
|
||||
zenith_region: us-east-1
|
||||
zenith_region_slug: virginia
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,12 +1,9 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.zenith.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -20,7 +17,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
@@ -10,8 +10,6 @@ dep-format-version = "2"
|
||||
# Hakari works much better with the new feature resolver.
|
||||
# For more about the new feature resolver, see:
|
||||
# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver
|
||||
# Have to keep the resolver still here since hakari requires this field,
|
||||
# despite it's now the default for 2021 edition & cargo.
|
||||
resolver = "2"
|
||||
|
||||
# Add triples corresponding to platforms commonly used by developers here.
|
||||
|
||||
40
.github/workflows/testing.yml
vendored
40
.github/workflows/testing.yml
vendored
@@ -1,8 +1,6 @@
|
||||
name: Build and Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
regression-check:
|
||||
@@ -34,11 +32,12 @@ jobs:
|
||||
if: matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev
|
||||
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
|
||||
|
||||
- name: Install macOS postgres dependencies
|
||||
- name: Install macOs postgres dependencies
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison openssl
|
||||
run: |
|
||||
brew install flex bison
|
||||
|
||||
- name: Set pg revision for caching
|
||||
id: pg_ver
|
||||
@@ -52,26 +51,10 @@ jobs:
|
||||
tmp_install/
|
||||
key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}
|
||||
|
||||
- name: Set extra env for macOS
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: |
|
||||
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
|
||||
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
|
||||
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: make postgres
|
||||
|
||||
# Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
|
||||
# and the real cause will be inside config.log
|
||||
- name: Print configure logs in case of failure
|
||||
if: failure()
|
||||
continue-on-error: true
|
||||
run: |
|
||||
echo '' && echo '=== config.log ===' && echo ''
|
||||
cat tmp_install/build/config.log
|
||||
echo '' && echo '=== configure.log ===' && echo ''
|
||||
cat tmp_install/build/configure.log
|
||||
make postgres
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
@@ -81,10 +64,13 @@ jobs:
|
||||
~/.cargo/registry
|
||||
~/.cargo/git
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
# Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo build --workspace --bins --examples --tests
|
||||
|
||||
- name: Run cargo test
|
||||
run: cargo test --all --all-targets
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,6 +11,3 @@ test_output/
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
|
||||
|
||||
*.key
|
||||
*.crt
|
||||
|
||||
20
COPYRIGHT
Normal file
20
COPYRIGHT
Normal file
@@ -0,0 +1,20 @@
|
||||
This software is licensed under the Apache 2.0 License:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
Copyright 2021 Zenith Labs, Inc
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
The PostgreSQL submodule in vendor/postgres is licensed under the
|
||||
PostgreSQL license. See vendor/postgres/COPYRIGHT.
|
||||
1119
Cargo.lock
generated
1119
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -3,12 +3,15 @@ members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"workspace_hack",
|
||||
"neon_local",
|
||||
"libs/*",
|
||||
"zenith",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
[profile.release]
|
||||
# This is useful for profiling and, to some extent, debug.
|
||||
@@ -18,4 +21,4 @@ debug = true
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
|
||||
@@ -26,9 +26,7 @@ COPY . .
|
||||
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats.
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build --release \
|
||||
&& cachepot -s
|
||||
RUN mold -run cargo build --release && cachepot -s
|
||||
|
||||
# Build final image
|
||||
#
|
||||
|
||||
@@ -8,11 +8,9 @@ ARG AWS_SECRET_ACCESS_KEY
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \
|
||||
&& cachepot -s
|
||||
RUN mold -run cargo build -p compute_tools --release && cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
FROM debian:buster-slim
|
||||
|
||||
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||
COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
12
Makefile
12
Makefile
@@ -12,21 +12,15 @@ endif
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# macOS with brew-installed openssl requires explicit paths
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib
|
||||
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# Choose whether we should be silent or verbose
|
||||
|
||||
5
NOTICE
5
NOTICE
@@ -1,5 +0,0 @@
|
||||
Neon
|
||||
Copyright 2022 Neon Inc.
|
||||
|
||||
The PostgreSQL submodule in vendor/postgres is licensed under the
|
||||
PostgreSQL license. See vendor/postgres/COPYRIGHT.
|
||||
130
README.md
130
README.md
@@ -23,100 +23,61 @@ Pageserver consists of:
|
||||
|
||||
## Running local installation
|
||||
|
||||
|
||||
#### building on Ubuntu/ Debian (Linux)
|
||||
1. Install build dependencies and other useful packages
|
||||
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
```text
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
[Rust] 1.58 or later is also required.
|
||||
|
||||
3. Install PostgreSQL Client
|
||||
```
|
||||
apt install postgresql-client
|
||||
```
|
||||
|
||||
4. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### building on OSX (12.3.1)
|
||||
1. Install XCode and dependencies
|
||||
```
|
||||
xcode-select --install
|
||||
brew install protobuf etcd
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
|
||||
3. Install PostgreSQL Client
|
||||
```
|
||||
# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos
|
||||
brew install libpq
|
||||
brew link --force libpq
|
||||
```
|
||||
|
||||
4. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### dependency installation notes
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
|
||||
2. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### running neon database
|
||||
1. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
3. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
```sh
|
||||
# Create repository in .zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/neon_local init
|
||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
|
||||
initial timeline de200bd42b49cc1814412c7e592dd6e9 created
|
||||
> ./target/debug/zenith init
|
||||
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
|
||||
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
|
||||
created main branch
|
||||
pageserver init succeeded
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/neon_local start
|
||||
Starting pageserver at '127.0.0.1:64000' in '.zenith'
|
||||
> ./target/debug/zenith start
|
||||
Starting pageserver at 'localhost:64000' in '.zenith'
|
||||
Pageserver started
|
||||
initializing for sk 1 for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
|
||||
initializing for single for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
|
||||
Safekeeper started
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/neon_local pg start main
|
||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||
> ./target/debug/zenith pg start main
|
||||
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
|
||||
waiting for server to start.... done
|
||||
server started
|
||||
|
||||
# check list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
|
||||
```
|
||||
|
||||
2. Now it is possible to connect to postgres and run some queries:
|
||||
4. Now it is possible to connect to postgres and run some queries:
|
||||
```text
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
@@ -130,28 +91,21 @@ postgres=# select * from t;
|
||||
(1 row)
|
||||
```
|
||||
|
||||
3. And create branches and run postgres on them:
|
||||
5. And create branches and run postgres on them:
|
||||
```sh
|
||||
# create branch named migration_check
|
||||
> ./target/debug/neon_local timeline branch --branch-name migration_check
|
||||
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
|
||||
> ./target/debug/zenith timeline branch --branch-name migration_check
|
||||
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
|
||||
|
||||
# check branches tree
|
||||
> ./target/debug/neon_local timeline list
|
||||
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
||||
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
||||
> ./target/debug/zenith timeline list
|
||||
main [5b014a9e41b4b63ce1a1febc04503636]
|
||||
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
|
||||
|
||||
# start postgres on that branch
|
||||
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
||||
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres'
|
||||
|
||||
# check the new list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
|
||||
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
|
||||
> ./target/debug/zenith pg start migration_check
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
|
||||
waiting for server to start.... done
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
@@ -164,20 +118,12 @@ postgres=# select * from t;
|
||||
|
||||
postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
|
||||
# check that the new change doesn't affect the 'main' postgres
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
you have just started. You can stop them all with one command:
|
||||
```sh
|
||||
> ./target/debug/neon_local stop
|
||||
> ./target/debug/zenith stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
@@ -11,11 +11,11 @@ clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
log = { version = "0.4", features = ["std", "serde"] }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
regex = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tar = "0.4"
|
||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Compute node tools
|
||||
|
||||
Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
`ExecStart` option. It will handle all the `Neon` specifics during compute node
|
||||
Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
`ExecStart` option. It will handle all the `zenith` specifics during compute node
|
||||
initialization:
|
||||
- `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
- `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
- Every start is a fresh start, so the data directory is removed and
|
||||
initialized again on each run.
|
||||
- Next it will put configuration files into the `PGDATA` directory.
|
||||
@@ -13,18 +13,18 @@ initialization:
|
||||
- Check and alter/drop/create roles and databases.
|
||||
- Hang waiting on the `postmaster` process to exit.
|
||||
|
||||
Also `compute_ctl` spawns two separate service threads:
|
||||
Also `zenith_ctl` spawns two separate service threads:
|
||||
- `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
into the shared `ComputeNode`;
|
||||
into the shared `ComputeState`;
|
||||
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
last activity requests.
|
||||
|
||||
Usage example:
|
||||
```sh
|
||||
compute_ctl -D /var/db/postgres/compute \
|
||||
-C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
-S /var/db/postgres/specs/current.json \
|
||||
-b /usr/local/bin/postgres
|
||||
zenith_ctl -D /var/db/postgres/compute \
|
||||
-C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
-S /var/db/postgres/specs/current.json \
|
||||
-b /usr/local/bin/postgres
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
@@ -1,174 +0,0 @@
|
||||
//!
|
||||
//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
|
||||
//! initialization:
|
||||
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
//! - Try to start `postgres` and wait until it is ready to accept connections.
|
||||
//! - Check and alter/drop/create roles and databases.
|
||||
//! - Hang waiting on the `postmaster` process to exit.
|
||||
//!
|
||||
//! Also `compute_ctl` spawns two separate service threads:
|
||||
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
//! into the shared `ComputeNode`;
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("compute_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
|
||||
let spec: ComputeSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.page_server_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_tenant")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_timeline")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
let compute_state = ComputeNode {
|
||||
start_time: Utc::now(),
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
tenant,
|
||||
timeline,
|
||||
pageserver_connstr,
|
||||
metrics: ComputeMetrics::new(),
|
||||
state: RwLock::new(ComputeState::new()),
|
||||
};
|
||||
let compute = Arc::new(compute_state);
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it.
|
||||
match compute.prepare_and_run() {
|
||||
Ok(ec) => {
|
||||
let code = ec.code().unwrap_or(1);
|
||||
info!("Postgres exited with code {}, shutting down", code);
|
||||
exit(code)
|
||||
}
|
||||
Err(error) => {
|
||||
error!("could not start the compute node: {}", error);
|
||||
|
||||
let mut state = compute.state.write().unwrap();
|
||||
state.error = Some(format!("{:?}", error));
|
||||
state.status = ComputeStatus::Failed;
|
||||
drop(state);
|
||||
|
||||
// Keep serving HTTP requests, so the cloud control plane was able to
|
||||
// get the actual error.
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
info!("shutting down");
|
||||
Err(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
251
compute_tools/src/bin/zenith_ctl.rs
Normal file
251
compute_tools/src/bin/zenith_ctl.rs
Normal file
@@ -0,0 +1,251 @@
|
||||
//!
|
||||
//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
//! `ExecStart` option. It will handle all the `zenith` specifics during compute node
|
||||
//! initialization:
|
||||
//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
//! - Try to start `postgres` and wait until it is ready to accept connections.
|
||||
//! - Check and alter/drop/create roles and databases.
|
||||
//! - Hang waiting on the `postmaster` process to exit.
|
||||
//!
|
||||
//! Also `zenith_ctl` spawns two separate service threads:
|
||||
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
//! into the shared `ComputeState`;
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! zenith_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::{exit, Command, ExitStatus};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use compute_tools::checker::create_writablity_check_data;
|
||||
use compute_tools::config;
|
||||
use compute_tools::http_api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::zenith::*;
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
|
||||
let state = state.read().unwrap();
|
||||
let spec = &state.spec;
|
||||
let pgdata_path = Path::new(&state.pgdata);
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.page_server_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_tenant")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_timeline")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
info!(
|
||||
"starting cluster #{}, operation #{}",
|
||||
spec.cluster.cluster_id,
|
||||
spec.operation_uuid.as_ref().unwrap()
|
||||
);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
create_pgdata(&state.pgdata)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, pageserver_connstr
|
||||
);
|
||||
get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, pageserver_connstr
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
|
||||
let read_state = state.read().unwrap();
|
||||
let pgdata_path = Path::new(&read_state.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&read_state.pgbin)
|
||||
.args(&["-D", &read_state.pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Try default Postgres port if it is not provided
|
||||
let port = read_state
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("port")
|
||||
.unwrap_or_else(|| "5432".to_string());
|
||||
wait_for_postgres(&port, pgdata_path)?;
|
||||
|
||||
let mut client = Client::connect(&read_state.connstr, NoTls)?;
|
||||
|
||||
handle_roles(&read_state.spec, &mut client)?;
|
||||
handle_databases(&read_state.spec, &mut client)?;
|
||||
create_writablity_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
info!(
|
||||
"finished configuration of cluster #{}",
|
||||
read_state.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
// Release the read lock.
|
||||
drop(read_state);
|
||||
|
||||
// Get the write lock, update state and release the lock, so HTTP API
|
||||
// was able to serve requests, while we are blocked waiting on
|
||||
// Postgres.
|
||||
let mut state = state.write().unwrap();
|
||||
state.ready = true;
|
||||
drop(state);
|
||||
|
||||
// Wait for child postgres process basically forever. In this state Ctrl+C
|
||||
// will be propagated to postgres and it will be shut down as well.
|
||||
let ecode = pg.wait().expect("failed to wait on postgres");
|
||||
|
||||
Ok(ecode)
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `zenith_utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("zenith_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
|
||||
let spec: ClusterSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let compute_state = ComputeState {
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
ready: false,
|
||||
last_active: Utc::now(),
|
||||
};
|
||||
let compute_state = Arc::new(RwLock::new(compute_state));
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let mut _threads = vec![
|
||||
launch_http_server(&compute_state).expect("cannot launch compute monitor thread"),
|
||||
launch_monitor(&compute_state).expect("cannot launch http endpoint thread"),
|
||||
];
|
||||
|
||||
prepare_pgdata(&compute_state)?;
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it. Panic if any error happens,
|
||||
// it will help us to trigger unwind and kill postmaster as well.
|
||||
match run_compute(&compute_state) {
|
||||
Ok(ec) => exit(ec.success() as i32),
|
||||
Err(error) => panic!("cannot start compute node, error: {}", error),
|
||||
}
|
||||
}
|
||||
@@ -1,11 +1,11 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use log::error;
|
||||
use postgres::Client;
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::zenith::ComputeState;
|
||||
|
||||
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
||||
let query = "
|
||||
@@ -23,9 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
|
||||
let connstr = &compute.connstr;
|
||||
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
|
||||
pub async fn check_writability(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
|
||||
let connstr = state.read().unwrap().connstr.clone();
|
||||
let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?;
|
||||
if client.is_closed() {
|
||||
return Err(anyhow!("connection to postgres closed"));
|
||||
}
|
||||
|
||||
@@ -1,315 +0,0 @@
|
||||
//
|
||||
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
|
||||
// but there are several things that makes `PostgresNode` usage inconvenient in the
|
||||
// cloud:
|
||||
// - it inherits from `LocalEnv`, which contains **all-all** the information about
|
||||
// a complete service running
|
||||
// - it uses `PageServerNode` with information about http endpoint, which we do not
|
||||
// need in the cloud again
|
||||
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
|
||||
//
|
||||
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
|
||||
// attributes (not required for the cloud). Yet, it is still tempting to unify these
|
||||
// `PostgresNode` and `ComputeNode` and use one in both places.
|
||||
//
|
||||
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
|
||||
//
|
||||
use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, ExitStatus, Stdio};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Serialize, Serializer};
|
||||
|
||||
use crate::checker::create_writablity_check_data;
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
pub start_time: DateTime<Utc>,
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant: String,
|
||||
pub timeline: String,
|
||||
pub pageserver_connstr: String,
|
||||
pub metrics: ComputeMetrics,
|
||||
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
|
||||
/// to allow HTTP API server to serve status requests, while configuration
|
||||
/// is in progress.
|
||||
pub state: RwLock<ComputeState>,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
x.to_rfc3339().serialize(s)
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
/// Timestamp of the last Postgres activity
|
||||
#[serde(serialize_with = "rfc3339_serialize")]
|
||||
pub last_active: DateTime<Utc>,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
impl ComputeState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
status: ComputeStatus::Init,
|
||||
last_active: Utc::now(),
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
Init,
|
||||
Running,
|
||||
Failed,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
pub sync_safekeepers_ms: AtomicU64,
|
||||
pub basebackup_ms: AtomicU64,
|
||||
pub config_ms: AtomicU64,
|
||||
pub total_startup_ms: AtomicU64,
|
||||
}
|
||||
|
||||
impl ComputeMetrics {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
sync_safekeepers_ms: AtomicU64::new(0),
|
||||
basebackup_ms: AtomicU64::new(0),
|
||||
config_ms: AtomicU64::new(0),
|
||||
total_startup_ms: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
self.state.write().unwrap().status = status;
|
||||
}
|
||||
|
||||
pub fn get_status(&self) -> ComputeStatus {
|
||||
self.state.read().unwrap().status
|
||||
}
|
||||
|
||||
// Remove `pgdata` directory and create it again with right permissions.
|
||||
fn create_pgdata(&self) -> Result<()> {
|
||||
// Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
|
||||
// If it is something different then create_dir() will error out anyway.
|
||||
let _ok = fs::remove_dir_all(&self.pgdata);
|
||||
fs::create_dir(&self.pgdata)?;
|
||||
fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
self.metrics.basebackup_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
fn sync_safekeepers(&self) -> Result<String> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
sync_output.status,
|
||||
);
|
||||
}
|
||||
|
||||
self.metrics.sync_safekeepers_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
pub fn prepare_pgdata(&self) -> Result<()> {
|
||||
let spec = &self.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers()
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(&lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
pub fn run(&self) -> Result<ExitStatus> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(&["-D", &self.pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Try default Postgres port if it is not provided
|
||||
let port = self
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("port")
|
||||
.unwrap_or_else(|| "5432".to_string());
|
||||
wait_for_postgres(&mut pg, &port, pgdata_path)?;
|
||||
|
||||
let mut client = Client::connect(&self.connstr, NoTls)?;
|
||||
|
||||
handle_roles(&self.spec, &mut client)?;
|
||||
handle_databases(&self.spec, &mut client)?;
|
||||
handle_grants(&self.spec, &mut client)?;
|
||||
create_writablity_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
let startup_end_time = Utc::now();
|
||||
|
||||
self.metrics.config_ms.store(
|
||||
startup_end_time
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
self.metrics.total_startup_ms.store(
|
||||
startup_end_time
|
||||
.signed_duration_since(self.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
self.set_status(ComputeStatus::Running);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
self.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
// Wait for child Postgres process basically forever. In this state Ctrl+C
|
||||
// will propagate to Postgres and it will be shut down as well.
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
|
||||
Ok(ecode)
|
||||
}
|
||||
|
||||
pub fn prepare_and_run(&self) -> Result<ExitStatus> {
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
self.spec.cluster.cluster_id,
|
||||
self.spec.operation_uuid.as_ref().unwrap(),
|
||||
self.tenant,
|
||||
self.timeline,
|
||||
);
|
||||
|
||||
self.prepare_pgdata()?;
|
||||
self.run()
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::pg_helpers::PgOptionsSerialize;
|
||||
use crate::spec::ComputeSpec;
|
||||
use crate::zenith::ClusterSpec;
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
@@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut postgres_conf = File::create(path)?;
|
||||
|
||||
write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
|
||||
write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write Postgres config block wrapped with generated comment section
|
||||
fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> {
|
||||
writeln!(file, "# Managed by Zenith: begin")?;
|
||||
writeln!(file, "{}", buf)?;
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
writeln!(file, "# Managed by Zenith: end")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
pub mod api;
|
||||
@@ -1,158 +0,0 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Compute node control API
|
||||
version: "1.0"
|
||||
|
||||
servers:
|
||||
- url: "http://localhost:3080"
|
||||
|
||||
paths:
|
||||
/status:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get compute node internal status
|
||||
description: ""
|
||||
operationId: getComputeStatus
|
||||
responses:
|
||||
"200":
|
||||
description: ComputeState
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeState"
|
||||
|
||||
/metrics.json:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get compute node startup metrics in JSON format
|
||||
description: ""
|
||||
operationId: getComputeMetricsJSON
|
||||
responses:
|
||||
"200":
|
||||
description: ComputeMetrics
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/ready:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Check whether compute startup process finished successfully
|
||||
description: ""
|
||||
operationId: computeIsReady
|
||||
responses:
|
||||
"200":
|
||||
description: Compute is ready ('true') or not ('false')
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "true"
|
||||
|
||||
/last_activity:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get timestamp of the last compute activity
|
||||
description: ""
|
||||
operationId: getLastComputeActivityTS
|
||||
responses:
|
||||
"200":
|
||||
description: Timestamp of the last compute activity
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
|
||||
/check_writability:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritabilityDeprecated
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
post:
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritability
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
|
||||
schemas:
|
||||
ComputeMetrics:
|
||||
type: object
|
||||
description: Compute startup metrics
|
||||
required:
|
||||
- sync_safekeepers_ms
|
||||
- basebackup_ms
|
||||
- config_ms
|
||||
- total_startup_ms
|
||||
properties:
|
||||
sync_safekeepers_ms:
|
||||
type: integer
|
||||
basebackup_ms:
|
||||
type: integer
|
||||
config_ms:
|
||||
type: integer
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
- last_active
|
||||
properties:
|
||||
status:
|
||||
$ref: '#/components/schemas/ComputeStatus'
|
||||
last_active:
|
||||
type: string
|
||||
description: The last detected compute activity timestamp in UTC and RFC3339 format
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
error:
|
||||
type: string
|
||||
description: Text of the error during compute startup, if any
|
||||
|
||||
ComputeStatus:
|
||||
type: string
|
||||
enum:
|
||||
- init
|
||||
- failed
|
||||
- running
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
@@ -1,64 +1,37 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use log::{error, info};
|
||||
use serde_json;
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeStatus};
|
||||
use crate::zenith::*;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
|
||||
async fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in the plain text.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/last_activity") => {
|
||||
info!("serving /last_active GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
let state = state.read().unwrap();
|
||||
|
||||
// Use RFC3339 format for consistency.
|
||||
Response::new(Body::from(state.last_active.to_rfc3339()))
|
||||
}
|
||||
|
||||
// Has compute setup process finished? -> true/false.
|
||||
// DEPRECATED in favour of /status
|
||||
// Has compute setup process finished? -> true/false
|
||||
(&Method::GET, "/ready") => {
|
||||
info!("serving /ready GET request");
|
||||
let status = compute.get_status();
|
||||
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
|
||||
let state = state.read().unwrap();
|
||||
Response::new(Body::from(format!("{}", state.ready)))
|
||||
}
|
||||
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
info!("serving /status GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
|
||||
}
|
||||
|
||||
// Startup metrics in JSON format. Keep /metrics reserved for a possible
|
||||
// future use for Prometheus metrics format.
|
||||
(&Method::GET, "/metrics.json") => {
|
||||
info!("serving /metrics.json GET request");
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
}
|
||||
|
||||
// DEPRECATED, use POST instead
|
||||
(&Method::GET, "/check_writability") => {
|
||||
info!("serving /check_writability GET request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
let res = crate::checker::check_writability(&state).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
@@ -76,7 +49,7 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<ComputeNode>) {
|
||||
async fn serve(state: Arc<RwLock<ComputeState>>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
@@ -100,7 +73,7 @@ async fn serve(state: Arc<ComputeNode>) {
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
pub fn launch_http_server(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
@@ -4,12 +4,11 @@
|
||||
//!
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod http;
|
||||
pub mod http_api;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
#[allow(clippy::format_push_string)] // Clippy's suggestion doesn't actually work
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod zenith;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{thread, time};
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -6,16 +6,16 @@ use chrono::{DateTime, Utc};
|
||||
use log::{debug, info};
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::zenith::ComputeState;
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
|
||||
|
||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
// Then update it in the shared state. This function never errors out.
|
||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.clone();
|
||||
let connstr = state.read().unwrap().connstr.clone();
|
||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||
let mut client = Client::connect(&connstr, NoTls);
|
||||
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
||||
@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors?
|
||||
&[],
|
||||
);
|
||||
let mut last_active = compute.state.read().unwrap().last_active;
|
||||
let mut last_active = state.read().unwrap().last_active;
|
||||
|
||||
if let Ok(backs) = backends {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
@@ -83,14 +83,14 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = compute.state.write().unwrap();
|
||||
let mut state = state.write().unwrap();
|
||||
if last_active > state.last_active {
|
||||
state.last_active = last_active;
|
||||
debug!("set the last compute activity time to: {}", last_active);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("cannot connect to postgres: {}, retrying", e);
|
||||
info!("cannot connect to postgres: {}, retrying", e);
|
||||
|
||||
// Establish a new connection and try again.
|
||||
client = Client::connect(&connstr, NoTls);
|
||||
@@ -100,7 +100,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
pub fn launch_monitor(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Child;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::{fs, thread, time};
|
||||
|
||||
@@ -222,12 +220,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
/// Wait for Postgres to become ready to accept connections:
|
||||
/// - state should be `ready` in the `pgdata/postmaster.pid`
|
||||
/// - and we should be able to connect to 127.0.0.1:5432
|
||||
pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
|
||||
pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
|
||||
let pid_path = pgdata.join("postmaster.pid");
|
||||
let mut slept: u64 = 0; // ms
|
||||
let pause = time::Duration::from_millis(100);
|
||||
|
||||
let timeout = time::Duration::from_millis(10);
|
||||
let timeout = time::Duration::from_millis(200);
|
||||
let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
|
||||
|
||||
loop {
|
||||
@@ -238,19 +236,14 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
|
||||
bail!("timed out while waiting for Postgres to start");
|
||||
}
|
||||
|
||||
if let Ok(Some(status)) = pg.try_wait() {
|
||||
// Postgres exited, that is not what we expected, bail out earlier.
|
||||
let code = status.code().unwrap_or(-1);
|
||||
bail!("Postgres exited unexpectedly with code {}", code);
|
||||
}
|
||||
|
||||
if pid_path.exists() {
|
||||
let file = BufReader::new(File::open(&pid_path)?);
|
||||
let status = file
|
||||
.lines()
|
||||
.last()
|
||||
.unwrap()
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
// XXX: dumb and the simplest way to get the last line in a text file
|
||||
// TODO: better use `.lines().last()` later
|
||||
let stdout = Command::new("tail")
|
||||
.args(&["-n1", pid_path.to_str().unwrap()])
|
||||
.output()?
|
||||
.stdout;
|
||||
let status = String::from_utf8(stdout)?;
|
||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||
|
||||
// Now Postgres is ready to accept connections
|
||||
|
||||
@@ -3,53 +3,16 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
use log::{info, log_enabled, warn, Level};
|
||||
use postgres::Client;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::config;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
}
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
}
|
||||
use crate::zenith::ClusterSpec;
|
||||
|
||||
/// It takes cluster specification and does the following:
|
||||
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
|
||||
/// - Update `pg_hba.conf` to allow external connections.
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
@@ -76,7 +39,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
/// Given a cluster spec json and open transaction it handles roles creation,
|
||||
/// deletion and update.
|
||||
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
let mut xact = client.transaction()?;
|
||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||
|
||||
@@ -173,20 +136,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
} else {
|
||||
info!("role name: '{}'", &name);
|
||||
info!("role name {}", &name);
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.quote());
|
||||
info!("role create query: '{}'", &query);
|
||||
info!("role create query {}", &query);
|
||||
info_print!(" -> create");
|
||||
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
||||
name.quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
}
|
||||
|
||||
info_print!("\n");
|
||||
@@ -202,7 +158,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||
/// atomicity should be enough here due to the order of operations and various checks,
|
||||
/// which together provide us idempotency.
|
||||
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
||||
|
||||
// Print a list of existing Postgres databases (only in debug mode)
|
||||
@@ -288,24 +244,3 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Grant CREATE ON DATABASE to the database owner
|
||||
// to allow clients create trusted extensions.
|
||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.quote(),
|
||||
db.owner.quote()
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
109
compute_tools/src/zenith.rs
Normal file
109
compute_tools/src/zenith.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
/// Compute node state shared across several `zenith_ctl` threads.
|
||||
/// Should be used under `RwLock` to allow HTTP API server to serve
|
||||
/// status requests, while configuration is in progress.
|
||||
pub struct ComputeState {
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ClusterSpec,
|
||||
/// Compute setup process has finished
|
||||
pub ready: bool,
|
||||
/// Timestamp of the last Postgres activity
|
||||
pub last_active: DateTime<Utc>,
|
||||
}
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ClusterSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
}
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
}
|
||||
|
||||
/// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
/// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
pub fn get_basebackup(
|
||||
pgdata: &str,
|
||||
connstr: &str,
|
||||
tenant: &str,
|
||||
timeline: &str,
|
||||
lsn: &str,
|
||||
) -> Result<()> {
|
||||
let mut client = Client::connect(connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", tenant, timeline, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
|
||||
ar.unpack(&pgdata)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
/// and return the reported LSN back to the caller.
|
||||
pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
|
||||
let sync_handle = Command::new(&pgbin)
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
sync_output.status,
|
||||
);
|
||||
}
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
@@ -4,12 +4,12 @@ mod pg_helpers_tests {
|
||||
use std::fs::File;
|
||||
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::ComputeSpec;
|
||||
use compute_tools::zenith::ClusterSpec;
|
||||
|
||||
#[test]
|
||||
fn params_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.databases.first().unwrap().to_pg_options(),
|
||||
@@ -24,7 +24,7 @@ mod pg_helpers_tests {
|
||||
#[test]
|
||||
fn settings_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
|
||||
@@ -5,7 +5,7 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
toml = "0.5"
|
||||
@@ -19,5 +19,5 @@ reqwest = { version = "0.11", default-features = false, features = ["blocking",
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
safekeeper = { path = "../safekeeper" }
|
||||
utils = { path = "../libs/utils" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -9,6 +9,3 @@ auth_type = 'Trust'
|
||||
id = 1
|
||||
pg_port = 5454
|
||||
http_port = 7676
|
||||
|
||||
[etcd_broker]
|
||||
broker_endpoints = ['http://127.0.0.1:2379']
|
||||
|
||||
@@ -11,12 +11,11 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
@@ -273,9 +272,12 @@ impl PostgresNode {
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
conf.append("wal_keep_size", "0");
|
||||
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
|
||||
conf.append("restart_after_crash", "off");
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeeper or
|
||||
// page server yet. (gh issue #349)
|
||||
conf.append("wal_keep_size", "10TB");
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
|
||||
@@ -1,93 +0,0 @@
|
||||
use std::{
|
||||
fs,
|
||||
path::PathBuf,
|
||||
process::{Command, Stdio},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{
|
||||
sys::signal::{kill, Signal},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
use crate::{local_env, read_pidfile};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
println!(
|
||||
"Starting etcd broker using {}",
|
||||
etcd_broker.etcd_binary_path.display()
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd data dir: {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create ectd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create ectd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
|
||||
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
|
||||
.args(&[
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
])
|
||||
.stdout(Stdio::from(etcd_stdout_file))
|
||||
.stderr(Stdio::from(etcd_stderr_file))
|
||||
.spawn()
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let pid = etcd_process.id();
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_path = &env.etcd_broker.etcd_binary_path;
|
||||
println!("Stopping etcd broker at {}", etcd_path.display());
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read etcd pid filea at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?);
|
||||
|
||||
kill(pid, Signal::SIGTERM).with_context(|| {
|
||||
format!(
|
||||
"Failed to stop etcd with pid {pid} at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
env.base_data_dir.join("etcd.pid")
|
||||
}
|
||||
@@ -12,7 +12,6 @@ use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
pub mod compute;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
//! script which will use local paths.
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
@@ -12,11 +11,9 @@ use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
@@ -60,7 +57,9 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub etcd_broker: EtcdBroker,
|
||||
// A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
|
||||
#[serde(default)]
|
||||
pub broker_endpoints: Option<String>,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
@@ -76,62 +75,6 @@ pub struct LocalEnv {
|
||||
branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
|
||||
}
|
||||
|
||||
/// Etcd broker config for cluster internal communication.
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EtcdBroker {
|
||||
/// A prefix to all to any key when pushing/polling etcd from a node.
|
||||
#[serde(default)]
|
||||
pub broker_etcd_prefix: Option<String>,
|
||||
|
||||
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Vec<DisplayFromStr>")]
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
/// Etcd binary path to use.
|
||||
#[serde(default)]
|
||||
pub etcd_binary_path: PathBuf,
|
||||
}
|
||||
|
||||
impl EtcdBroker {
|
||||
pub fn locate_etcd() -> anyhow::Result<PathBuf> {
|
||||
let which_output = Command::new("which")
|
||||
.arg("etcd")
|
||||
.output()
|
||||
.context("Failed to run 'which etcd' command")?;
|
||||
let stdout = String::from_utf8_lossy(&which_output.stdout);
|
||||
ensure!(
|
||||
which_output.status.success(),
|
||||
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
|
||||
which_output.status,
|
||||
String::from_utf8_lossy(&which_output.stderr)
|
||||
);
|
||||
|
||||
let etcd_path = PathBuf::from(stdout.trim());
|
||||
ensure!(
|
||||
etcd_path.is_file(),
|
||||
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
|
||||
etcd_path.display()
|
||||
);
|
||||
|
||||
Ok(etcd_path)
|
||||
}
|
||||
|
||||
pub fn comma_separated_endpoints(&self) -> String {
|
||||
self.broker_endpoints.iter().map(Url::as_str).fold(
|
||||
String::new(),
|
||||
|mut comma_separated_urls, url| {
|
||||
if !comma_separated_urls.is_empty() {
|
||||
comma_separated_urls.push(',');
|
||||
}
|
||||
comma_separated_urls.push_str(url);
|
||||
comma_separated_urls
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct PageServerConf {
|
||||
@@ -235,7 +178,12 @@ impl LocalEnv {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
bail!(
|
||||
"branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
|
||||
branch_name,
|
||||
old_timeline_id,
|
||||
timeline_id
|
||||
);
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
@@ -271,7 +219,7 @@ impl LocalEnv {
|
||||
///
|
||||
/// Unlike 'load_config', this function fills in any defaults that are missing
|
||||
/// from the config file.
|
||||
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
|
||||
pub fn create_config(toml: &str) -> anyhow::Result<Self> {
|
||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
||||
|
||||
// Find postgres binaries.
|
||||
@@ -284,11 +232,26 @@ impl LocalEnv {
|
||||
env.pg_distrib_dir = cwd.join("tmp_install")
|
||||
}
|
||||
}
|
||||
if !env.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
// Find zenith binaries.
|
||||
if env.zenith_distrib_dir == Path::new("") {
|
||||
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !env.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
env.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenant_id.is_none() {
|
||||
@@ -382,36 +345,6 @@ impl LocalEnv {
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{binary}' in zenith distrib dir '{}'",
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
@@ -469,35 +402,7 @@ impl LocalEnv {
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val),
|
||||
None => PathBuf::from(".zenith"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_conf_parsing() {
|
||||
let simple_conf_toml = include_str!("../simple.conf");
|
||||
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
|
||||
assert!(
|
||||
simple_conf_parse_result.is_ok(),
|
||||
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
|
||||
);
|
||||
|
||||
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
|
||||
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
|
||||
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
|
||||
assert!(
|
||||
spoiled_url_toml.contains(spoiled_url_str),
|
||||
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
|
||||
);
|
||||
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
|
||||
assert!(
|
||||
spoiled_url_parse_result.is_err(),
|
||||
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
|
||||
);
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,15 +15,13 @@ use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper::http::models::TimelineCreateRequest;
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -52,7 +50,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
@@ -75,12 +73,16 @@ pub struct SafekeeperNode {
|
||||
pub http_base_url: String,
|
||||
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
|
||||
broker_endpoints: Option<String>,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(env));
|
||||
|
||||
println!("initializing for sk {} for {}", conf.id, conf.http_port);
|
||||
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
@@ -89,6 +91,7 @@ impl SafekeeperNode {
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
pageserver,
|
||||
broker_endpoints: env.broker_endpoints.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,13 +138,8 @@ impl SafekeeperNode {
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
if let Some(ref ep) = self.broker_endpoints {
|
||||
cmd.args(&["--broker-endpoints", ep]);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
@@ -205,13 +203,12 @@ impl SafekeeperNode {
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping safekeeper {} immediately..", self.id);
|
||||
println!("Stop safekeeper immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping safekeeper {} gracefully..", self.id);
|
||||
println!("Stop safekeeper gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
@@ -233,35 +230,25 @@ impl SafekeeperNode {
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if safekeeper flushes a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for _ in 0..100 {
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
if err.kind() != io::ErrorKind::ConnectionRefused {
|
||||
eprintln!("\nSafekeeper connection failed with error: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
if tcp_stopped {
|
||||
// Also check status on the HTTP port
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Safekeeper stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
Ok(_) => {
|
||||
println!("Safekeeper status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!("\nSafekeeper status check failed with error: {err}");
|
||||
println!("Safekeeper status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => {
|
||||
// keep waiting
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Safekeeper still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::io::{BufReader, Write};
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
use std::process::Command;
|
||||
@@ -11,23 +9,21 @@ use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::timelines::TimelineInfo;
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use pageserver::tenant_mgr::TenantInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -122,16 +118,6 @@ impl PageServerNode {
|
||||
);
|
||||
let listen_pg_addr_param =
|
||||
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
|
||||
let broker_endpoints_param = format!(
|
||||
"broker_endpoints=[{}]",
|
||||
self.env
|
||||
.etcd_broker
|
||||
.broker_endpoints
|
||||
.iter()
|
||||
.map(|url| format!("'{url}'"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
let mut args = Vec::with_capacity(20);
|
||||
|
||||
args.push("--init");
|
||||
@@ -140,19 +126,8 @@ impl PageServerNode {
|
||||
args.extend(["-c", &authg_type_param]);
|
||||
args.extend(["-c", &listen_http_addr_param]);
|
||||
args.extend(["-c", &listen_pg_addr_param]);
|
||||
args.extend(["-c", &broker_endpoints_param]);
|
||||
args.extend(["-c", &id]);
|
||||
|
||||
let broker_etcd_prefix_param = self
|
||||
.env
|
||||
.etcd_broker
|
||||
.broker_etcd_prefix
|
||||
.as_ref()
|
||||
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
|
||||
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
|
||||
args.extend(["-c", broker_etcd_prefix_param]);
|
||||
}
|
||||
|
||||
for config_override in config_overrides {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
@@ -189,9 +164,6 @@ impl PageServerNode {
|
||||
);
|
||||
}
|
||||
|
||||
// echo the captured output of the init command
|
||||
println!("{}", String::from_utf8_lossy(&init_output.stdout));
|
||||
|
||||
Ok(initial_timeline_id)
|
||||
}
|
||||
|
||||
@@ -211,6 +183,8 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
|
||||
let repo_path = self.repo_path();
|
||||
let mut args = vec!["-D", repo_path.to_str().unwrap()];
|
||||
|
||||
@@ -218,11 +192,9 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
|
||||
if !filled_cmd.status()?.success() {
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
@@ -282,13 +254,12 @@ impl PageServerNode {
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping pageserver immediately..");
|
||||
println!("Stop pageserver immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping pageserver gracefully..");
|
||||
println!("Stop pageserver gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
@@ -310,36 +281,25 @@ impl PageServerNode {
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for _ in 0..100 {
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
if err.kind() != io::ErrorKind::ConnectionRefused {
|
||||
eprintln!("\nPageserver connection failed with error: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
if tcp_stopped {
|
||||
// Also check status on the HTTP port
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Pageserver stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
Ok(_) => {
|
||||
println!("Pageserver status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!("\nPageserver status check failed with error: {err}");
|
||||
println!("Pageserver status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => {
|
||||
// keep waiting
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Pageserver still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
@@ -382,36 +342,10 @@ impl PageServerNode {
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: Option<ZTenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<Option<ZTenantId>> {
|
||||
let tenant_id_string = self
|
||||
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
|
||||
.json(&TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.json(&TenantCreateRequest { new_tenant_id })
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<String>>()?;
|
||||
@@ -428,35 +362,6 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
|
||||
.json(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
@@ -493,63 +398,4 @@ impl PageServerNode {
|
||||
|
||||
Ok(timeline_info_response)
|
||||
}
|
||||
|
||||
/// Import a basebackup prepared using either:
|
||||
/// a) `pg_basebackup -F tar`, or
|
||||
/// b) The `fullbackup` pageserver endpoint
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `tenant_id` - tenant to import into. Created if not exists
|
||||
/// * `timeline_id` - id to assign to imported timeline
|
||||
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
|
||||
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
|
||||
pub fn timeline_import(
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
base: (Lsn, PathBuf),
|
||||
pg_wal: Option<(Lsn, PathBuf)>,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
|
||||
|
||||
// Init base reader
|
||||
let (start_lsn, base_tarfile_path) = base;
|
||||
let base_tarfile = File::open(base_tarfile_path)?;
|
||||
let mut base_reader = BufReader::new(base_tarfile);
|
||||
|
||||
// Init wal reader if necessary
|
||||
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
|
||||
let wal_tarfile = File::open(wal_tarfile_path)?;
|
||||
let wal_reader = BufReader::new(wal_tarfile);
|
||||
(end_lsn, Some(wal_reader))
|
||||
} else {
|
||||
(start_lsn, None)
|
||||
};
|
||||
|
||||
// Import base
|
||||
let import_cmd =
|
||||
format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut base_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
|
||||
// Import wal if necessary
|
||||
if let Some(mut wal_reader) = wal_reader {
|
||||
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
|
||||
let mut writer = client.copy_in(&import_cmd)?;
|
||||
io::copy(&mut wal_reader, &mut writer)?;
|
||||
writer.finish()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
@@ -1,20 +1,13 @@
|
||||
#!/bin/sh
|
||||
set -eux
|
||||
|
||||
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
|
||||
if [ "$broker_endpoints_param" != "absent" ]; then
|
||||
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
|
||||
else
|
||||
broker_endpoints_param=''
|
||||
fi
|
||||
|
||||
if [ "$1" = 'pageserver' ]; then
|
||||
if [ ! -d "/data/tenants" ]; then
|
||||
echo "Initializing pageserver data directory"
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
|
||||
fi
|
||||
echo "Staring pageserver at 0.0.0.0:6400"
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
|
||||
else
|
||||
"$@"
|
||||
fi
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
|
||||
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
|
||||
- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
|
||||
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
|
||||
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
|
||||
- [safekeeper/README](/safekeeper/README) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori
|
||||
tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256")
|
||||
```
|
||||
|
||||
Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs
|
||||
Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
# Docker images of Neon
|
||||
# Docker images of Zenith
|
||||
|
||||
## Images
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
|
||||
|
||||
And additional intermediate image:
|
||||
And additional intermediate images:
|
||||
|
||||
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
|
||||
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
|
||||
|
||||
## Building pipeline
|
||||
|
||||
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
|
||||
1. Image `zenithdb/compute-tools` is re-built automatically.
|
||||
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
|
||||
2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
|
||||
|
||||
2. `neondatabase/neon`
|
||||
3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
|
||||
|
||||
@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
|
||||
|
||||
### Branch
|
||||
|
||||
We can create branch at certain LSN using `neon_local timeline branch` command.
|
||||
We can create branch at certain LSN using `zenith timeline branch` command.
|
||||
Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline.
|
||||
|
||||
### Layered repository
|
||||
|
||||
Neon repository implementation that keeps data in layers.
|
||||
Zenith repository implementation that keeps data in layers.
|
||||
### LSN
|
||||
|
||||
The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
|
||||
@@ -101,7 +101,7 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a
|
||||
Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
|
||||
Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.
|
||||
|
||||
In Postgres and Neon LSNs are used to describe certain points in WAL handling.
|
||||
In postgres and Zenith lsns are used to describe certain points in WAL handling.
|
||||
|
||||
PostgreSQL LSNs and functions to monitor them:
|
||||
* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
|
||||
@@ -111,13 +111,13 @@ PostgreSQL LSNs and functions to monitor them:
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
|
||||
|
||||
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
|
||||
Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
|
||||
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
|
||||
* `RestartLSN`: position in WAL confirmed by all safekeepers.
|
||||
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
|
||||
* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records.
|
||||
|
||||
Neon pageserver LSNs:
|
||||
Zenith pageserver LSNs:
|
||||
* `last_record_lsn` - the end of last processed WAL record.
|
||||
* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
|
||||
* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
|
||||
@@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver.
|
||||
|
||||
### Pageserver
|
||||
|
||||
Neon storage engine: repositories + wal receiver + page service + wal redo.
|
||||
Zenith storage engine: repositories + wal receiver + page service + wal redo.
|
||||
|
||||
### Page service
|
||||
|
||||
@@ -184,10 +184,10 @@ relation exceeds that size, it is split into multiple segments.
|
||||
SLRUs include pg_clog, pg_multixact/members, and
|
||||
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
|
||||
they don't need to be stored permanently (e.g. pg_subtrans),
|
||||
or we do not support them in neon yet (pg_commit_ts).
|
||||
or we do not support them in zenith yet (pg_commit_ts).
|
||||
|
||||
### Tenant (Multitenancy)
|
||||
Tenant represents a single customer, interacting with Neon.
|
||||
Tenant represents a single customer, interacting with Zenith.
|
||||
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
|
||||
One pageserver[] can serve multiple tenants at once.
|
||||
One safekeeper
|
||||
|
||||
@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
|
||||
S3. WAL that has been archived to S3 can be removed from the
|
||||
safekeepers, so the safekeepers don't need a lot of disk space.
|
||||
|
||||
```
|
||||
|
||||
+----------------+
|
||||
+-----> | WAL safekeeper |
|
||||
| +----------------+
|
||||
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
|
||||
\
|
||||
\
|
||||
\
|
||||
\ +--------+
|
||||
\ | |
|
||||
+------> | S3 |
|
||||
| |
|
||||
+--------+
|
||||
\ +--------+
|
||||
\ | |
|
||||
+--> | S3 |
|
||||
| |
|
||||
+--------+
|
||||
|
||||
|
||||
```
|
||||
Every WAL safekeeper holds a section of WAL, and a VCL value.
|
||||
The WAL can be divided into three portions:
|
||||
|
||||
```
|
||||
|
||||
VCL LSN
|
||||
| |
|
||||
V V
|
||||
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
|
||||
Archived WAL Completed WAL In-flight WAL
|
||||
```
|
||||
|
||||
|
||||
Note that all this WAL kept in a safekeeper is a contiguous section.
|
||||
This is different from Aurora: In Aurora, there can be holes in the
|
||||
|
||||
@@ -1,151 +0,0 @@
|
||||
# Dispatching a connection
|
||||
|
||||
For each client connection, Neon service needs to authenticate the
|
||||
connection, and route it to the right PostgreSQL instance.
|
||||
|
||||
## Authentication
|
||||
|
||||
There are three different ways to authenticate:
|
||||
|
||||
- anonymous; no authentication needed
|
||||
- PostgreSQL authentication
|
||||
- github single sign-on using browser
|
||||
|
||||
In anonymous access, the user doesn't need to perform any
|
||||
authentication at all. This can be used e.g. in interactive PostgreSQL
|
||||
documentation, allowing you to run the examples very quickly. Similar
|
||||
to sqlfiddle.com.
|
||||
|
||||
PostgreSQL authentication works the same as always. All the different
|
||||
PostgreSQL authentication options like SCRAM, kerberos, etc. are
|
||||
available. [1]
|
||||
|
||||
The third option is to authenticate with github single sign-on. When
|
||||
you open the connection in psql, you get a link that you open with
|
||||
your browser. Opening the link redirects you to github authentication,
|
||||
and lets the connection to proceed. This is also known as "Link auth" [2].
|
||||
|
||||
|
||||
## Routing the connection
|
||||
|
||||
When a client starts a connection, it needs to be routed to the
|
||||
correct PostgreSQL instance. Routing can be done by the proxy, acting
|
||||
as a man-in-the-middle, or the connection can be routed at the network
|
||||
level based on the hostname or IP address.
|
||||
|
||||
Either way, Neon needs to identify which PostgreSQL instance the
|
||||
connection should be routed to. If the instance is not already
|
||||
running, it needs to be started. Some connections always require a new
|
||||
PostgreSQL instance to be created, e.g. if you want to run a one-off
|
||||
query against a particular point-in-time.
|
||||
|
||||
The PostgreSQL instance is identified by:
|
||||
- Neon account (possibly anonymous)
|
||||
- cluster (known as tenant in the storage?)
|
||||
- branch or snapshot name
|
||||
- timestamp (PITR)
|
||||
- primary or read-replica
|
||||
- one-off read replica
|
||||
- one-off writeable branch
|
||||
|
||||
When you are using regular PostgreSQL authentication or anonymous
|
||||
access, the connection URL needs to contain all the information needed
|
||||
for the routing. With github single sign-on, the browser is involved
|
||||
and some details - the Neon account in particular - can be deduced
|
||||
from the authentication exchange.
|
||||
|
||||
There are three methods for identifying the PostgreSQL instance:
|
||||
|
||||
- Browser interaction (link auth)
|
||||
- Options in the connection URL and the domain name
|
||||
- A pre-defined endpoint, identified by domain name or IP address
|
||||
|
||||
### Link Auth
|
||||
|
||||
postgres://<username>@start.neon.tech/<dbname>
|
||||
|
||||
This gives you a link that you open in browser. Clicking the link
|
||||
performs github authentication, and the Neon account name is
|
||||
provided to the proxy behind the scenes. The proxy routes the
|
||||
connection to the primary PostgreSQL instance in cluster called
|
||||
"main", branch "main".
|
||||
|
||||
Further ideas:
|
||||
- You could pre-define a different target for link auth
|
||||
connections in the UI.
|
||||
- You could have a drop-down in the browser, allowing you to connect
|
||||
to any cluster you want. Link Auth can be like Teleport.
|
||||
|
||||
### Connection URL
|
||||
|
||||
The connection URL looks like this:
|
||||
|
||||
postgres://<username>@<cluster-id>.db.neon.tech/<dbname>
|
||||
|
||||
By default, this connects you to the primary PostgreSQL instance
|
||||
running on the "main" branch in the named cluster [3]. However, you can
|
||||
change that by specifying options in the connection URL. The following
|
||||
options are supported:
|
||||
|
||||
| option name | Description | Examples |
|
||||
| --- | --- | --- |
|
||||
| cluster | Cluster name | cluster:myproject |
|
||||
| branch | Branch name | branch:main |
|
||||
| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z |
|
||||
| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 |
|
||||
| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new |
|
||||
|
||||
For example, to read branch 'testing' as it was on Mar 31, 2022, you could
|
||||
specify a timestamp in the connection URL [4]:
|
||||
|
||||
postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31
|
||||
|
||||
Connecting with cluster name and options can be disabled in the UI. If
|
||||
disabled, you can only connect using a pre-defined endpoint.
|
||||
|
||||
### Pre-defined Endpoint
|
||||
|
||||
Instead of providing the cluster name, branch, and all those options
|
||||
in the connection URL, you can define a named endpoint with the same
|
||||
options.
|
||||
|
||||
In the UI, click "create endpoint". Fill in the details:
|
||||
|
||||
- Cluster name
|
||||
- Branch
|
||||
- timestamp or LSN
|
||||
- is this for the primary or for a read replica
|
||||
- etc.
|
||||
|
||||
When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect:
|
||||
|
||||
postgres://<username>@<endpoint-id>.endpoint.neon.tech/<dbname>
|
||||
|
||||
|
||||
An endpoint can be assigned a static or dynamic IP address, so that
|
||||
you can connect to it with clients that don't support TLS SNI. Maybe
|
||||
bypass the proxy altogether, but that ought to be invisible to the
|
||||
user.
|
||||
|
||||
You can limit the range of source IP addresses that are allowed to
|
||||
connect to an endpoint. An endpoint can also be exposed in an Amazon
|
||||
VPC, allowing direct connections from applications.
|
||||
|
||||
|
||||
# Footnotes
|
||||
|
||||
[1] I'm not sure how feasible it is to set up configure like Kerberos
|
||||
or LDAP in a cloud environment. But in principle I think we should
|
||||
allow customers to have the full power of PostgreSQL, including all
|
||||
authentication options. However, it's up to the customer to configure
|
||||
it correctly.
|
||||
|
||||
[2] Link is a way to both authenticate and to route the connection
|
||||
|
||||
[3] This assumes that cluster-ids are globally unique, across all
|
||||
Neon accounts.
|
||||
|
||||
[4] The syntax accepted in the connection URL is limited by libpq. The
|
||||
only way to pass arbitrary options to the server (or our proxy) is
|
||||
with the "options" keyword, and the options must be percent-encoded. I
|
||||
think the above would work but i haven't tested it
|
||||
@@ -6,6 +6,7 @@ If there's no such file during `init` phase of the server, it creates the file i
|
||||
There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
|
||||
the values in the config file, if any are specified for the same key and get into the final config during init phase.
|
||||
|
||||
|
||||
### Config example
|
||||
|
||||
```toml
|
||||
@@ -25,22 +26,18 @@ max_file_descriptors = '100'
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
initial_superuser_name = 'zenith_admin'
|
||||
|
||||
broker_etcd_prefix = 'neon'
|
||||
broker_endpoints = ['some://etcd']
|
||||
|
||||
# [remote_storage]
|
||||
```
|
||||
|
||||
The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user,
|
||||
see the corresponding section below.
|
||||
The config above shows default values for all basic pageserver settings.
|
||||
Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank.
|
||||
Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
|
||||
|
||||
Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and
|
||||
|
||||
- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
|
||||
* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
|
||||
|
||||
- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
|
||||
* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
|
||||
|
||||
### Config values
|
||||
|
||||
@@ -50,17 +47,6 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage=
|
||||
|
||||
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
|
||||
|
||||
#### broker_endpoints
|
||||
|
||||
A list of endpoints (etcd currently) to connect and pull the information from.
|
||||
Mandatory, does not have a default, since requires etcd to be started as a separate process,
|
||||
and its connection url should be specified separately.
|
||||
|
||||
#### broker_etcd_prefix
|
||||
|
||||
A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
|
||||
Default is `neon`.
|
||||
|
||||
#### checkpoint_distance
|
||||
|
||||
`checkpoint_distance` is the amount of incoming WAL that is held in
|
||||
@@ -71,7 +57,7 @@ but it will trigger a checkpoint operation to get it back below the
|
||||
limit.
|
||||
|
||||
`checkpoint_distance` also determines how much WAL needs to be kept
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
this much WAL, with some headroom, otherwise you can get stuck in a
|
||||
situation where the safekeeper is full and stops accepting new WAL,
|
||||
but the pageserver is not flushing out and releasing the space in the
|
||||
@@ -86,11 +72,7 @@ The unit is # of bytes.
|
||||
|
||||
Every `compaction_period` seconds, the page server checks if
|
||||
maintenance operations, like compaction, are needed on the layer
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### compaction_target_size
|
||||
|
||||
File sizes for L0 delta and L1 image layers. Default is 128MB.
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### gc_horizon
|
||||
|
||||
@@ -103,14 +85,6 @@ away.
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
L0 delta layer threshold for L1 iamge layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
|
||||
#### initial_superuser_name
|
||||
|
||||
Name of the initial superuser role, passed to initdb when a new tenant
|
||||
@@ -177,11 +151,12 @@ bucket_region = 'eu-north-1'
|
||||
# Optional, pageserver uses entire bucket if the prefix is not specified.
|
||||
prefix_in_bucket = '/some/prefix/'
|
||||
|
||||
# S3 API query limit to avoid getting errors/throttling from AWS.
|
||||
concurrency_limit = 100
|
||||
```
|
||||
# Access key to connect to the bucket ("login" part of the credentials)
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
|
||||
If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials.
|
||||
# Secret access key to connect to the bucket ("password" part of the credentials)
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
###### General remote storage configuration
|
||||
|
||||
@@ -192,13 +167,14 @@ Besides, there are parameters common for all types of remote storage that can be
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
|
||||
max_concurrent_syncs = 50
|
||||
# Max number of concurrent connections to open for uploading to or downloading from the remote storage.
|
||||
max_concurrent_sync = 100
|
||||
|
||||
# Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
|
||||
max_sync_errors = 10
|
||||
```
|
||||
|
||||
|
||||
## safekeeper
|
||||
|
||||
TODO
|
||||
|
||||
@@ -28,7 +28,12 @@ The pageserver has a few different duties:
|
||||
- Receive WAL from the WAL service and decode it.
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
|
||||
For more detailed info, see [/pageserver/README](/pageserver/README.md)
|
||||
For more detailed info, see `/pageserver/README`
|
||||
|
||||
`/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/proxy`:
|
||||
|
||||
@@ -57,7 +62,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
It acts as a holding area and redistribution center for recently generated WAL.
|
||||
|
||||
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
|
||||
For more detailed info, see `/safekeeper/README`
|
||||
|
||||
`/workspace_hack`:
|
||||
The workspace_hack crate exists only to pin down some dependencies.
|
||||
@@ -69,21 +74,14 @@ We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation.
|
||||
Main entry point for the 'zenith' CLI utility.
|
||||
TODO: Doesn't it belong to control_plane?
|
||||
|
||||
`/libs`:
|
||||
Unites granular neon helper crates under the hood.
|
||||
`/zenith_metrics`:
|
||||
|
||||
`/libs/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/libs/utils`:
|
||||
Generic helpers that are shared between other crates in this repository.
|
||||
A subject for future modularization.
|
||||
|
||||
`/libs/metrics`:
|
||||
Helpers for exposing Prometheus metrics from the server.
|
||||
|
||||
`/zenith_utils`:
|
||||
|
||||
Helpers that are shared between other crates in this repository.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
@@ -91,22 +89,18 @@ so manual installation of dependencies is not recommended.
|
||||
A single virtual environment with all dependencies is described in the single `Pipfile`.
|
||||
|
||||
### Prerequisites
|
||||
- Install Python 3.9 (the minimal supported version) or greater.
|
||||
- Install Python 3.7 (the minimal supported version) or greater.
|
||||
- Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected.
|
||||
- If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
|
||||
- If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.:
|
||||
```bash
|
||||
# In Ubuntu
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa
|
||||
sudo apt update
|
||||
sudo apt install python3.9
|
||||
sudo apt install python3.7
|
||||
```
|
||||
- Install `poetry`
|
||||
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
|
||||
- Install dependencies via `./scripts/pysync`.
|
||||
- Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
|
||||
so if you have different version some linting tools can yield different result locally vs in the CI.
|
||||
- You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`.
|
||||
This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning.
|
||||
- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI.
|
||||
|
||||
Run `poetry shell` to activate the virtual environment.
|
||||
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
[package]
|
||||
name = "etcd_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
etcd-client = "0.9.0"
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio = "1"
|
||||
tracing = "0.1"
|
||||
thiserror = "1"
|
||||
@@ -1,348 +0,0 @@
|
||||
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
||||
//! Intended to connect services to each other, not to store their data.
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
fmt::Display,
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use regex::{Captures, Regex};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
pub use etcd_client::*;
|
||||
|
||||
use tokio::{sync::mpsc, task::JoinHandle};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZNodeId, ZTenantId, ZTenantTimelineId},
|
||||
};
|
||||
|
||||
/// Default value to use for prefixing to all etcd keys with.
|
||||
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
||||
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
struct SafekeeperTimeline {
|
||||
safekeeper_id: ZNodeId,
|
||||
info: SkTimelineInfo,
|
||||
}
|
||||
|
||||
/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub flush_lsn: Option<Lsn>,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub s3_wal_lsn: Option<Lsn>,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
#[serde(default)]
|
||||
pub safekeeper_connection_string: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BrokerError {
|
||||
#[error("Etcd client error: {0}. Context: {1}")]
|
||||
EtcdClient(etcd_client::Error, String),
|
||||
#[error("Error during parsing etcd data: {0}")]
|
||||
ParsingError(String),
|
||||
#[error("Internal error: {0}")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
/// A way to control the data retrieval from a certain subscription.
|
||||
pub struct SkTimelineSubscription {
|
||||
safekeeper_timeline_updates:
|
||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>>,
|
||||
kind: SkTimelineSubscriptionKind,
|
||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||
watcher: Watcher,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscription {
|
||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
||||
pub async fn fetch_data(
|
||||
&mut self,
|
||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>>> {
|
||||
self.safekeeper_timeline_updates.recv().await
|
||||
}
|
||||
|
||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||
self.watcher.cancel().await.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!(
|
||||
"Failed to cancel timeline subscription, kind: {:?}",
|
||||
self.kind
|
||||
),
|
||||
)
|
||||
})?;
|
||||
self.watcher_handle.await.map_err(|e| {
|
||||
BrokerError::InternalError(format!(
|
||||
"Failed to join the timeline updates task, kind: {:?}, error: {e}",
|
||||
self.kind
|
||||
))
|
||||
})?
|
||||
}
|
||||
}
|
||||
|
||||
/// The subscription kind to the timeline updates from safekeeper.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct SkTimelineSubscriptionKind {
|
||||
broker_etcd_prefix: String,
|
||||
kind: SubscriptionKind,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscriptionKind {
|
||||
pub fn all(broker_etcd_prefix: String) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::All,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::Tenant(tenant),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::Timeline(timeline),
|
||||
}
|
||||
}
|
||||
|
||||
fn watch_regex(&self) -> Regex {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => Regex::new(&format!(
|
||||
r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'everything' subscription"),
|
||||
SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'tenant' subscription"),
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'timeline' subscription"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
||||
pub fn watch_key(&self) -> String {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
|
||||
SubscriptionKind::Tenant(tenant_id) => {
|
||||
format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
|
||||
}
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => format!(
|
||||
"{}/{tenant_id}/{timeline_id}/safekeeper",
|
||||
self.broker_etcd_prefix
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
enum SubscriptionKind {
|
||||
/// Get every timeline update.
|
||||
All,
|
||||
/// Get certain tenant timelines' updates.
|
||||
Tenant(ZTenantId),
|
||||
/// Get certain timeline updates.
|
||||
Timeline(ZTenantTimelineId),
|
||||
}
|
||||
|
||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||
/// Stops and returns `Err` on any error during etcd communication.
|
||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||
/// exiting normally in such cases.
|
||||
pub async fn subscribe_to_safekeeper_timeline_updates(
|
||||
client: &mut Client,
|
||||
subscription: SkTimelineSubscriptionKind,
|
||||
) -> Result<SkTimelineSubscription, BrokerError> {
|
||||
info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
|
||||
|
||||
let (watcher, mut stream) = client
|
||||
.watch(
|
||||
subscription.watch_key(),
|
||||
Some(WatchOptions::new().with_prefix()),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!("Failed to init the watch for subscription {subscription:?}"),
|
||||
)
|
||||
})?;
|
||||
|
||||
let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
|
||||
|
||||
let subscription_kind = subscription.kind;
|
||||
let regex = subscription.watch_regex();
|
||||
let watcher_handle = tokio::spawn(async move {
|
||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||
"Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
|
||||
)))? {
|
||||
if resp.canceled() {
|
||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||
break;
|
||||
}
|
||||
|
||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<ZNodeId, SkTimelineInfo>> = HashMap::new();
|
||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
||||
let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
|
||||
|
||||
|
||||
let events = resp.events();
|
||||
debug!("Processing {} events", events.len());
|
||||
|
||||
for event in events {
|
||||
if EventType::Put == event.event_type() {
|
||||
if let Some(new_etcd_kv) = event.kv() {
|
||||
let new_kv_version = new_etcd_kv.version();
|
||||
|
||||
match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) {
|
||||
Ok(Some((zttid, timeline))) => {
|
||||
match timeline_updates
|
||||
.entry(zttid)
|
||||
.or_default()
|
||||
.entry(timeline.safekeeper_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
|
||||
if old_etcd_kv_version < new_kv_version {
|
||||
o.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => error!("Failed to parse timeline update: {e}"),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = timeline_updates_sender.send(timeline_updates) {
|
||||
info!("Timeline updates sender got dropped, exiting: {e}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
});
|
||||
|
||||
Ok(SkTimelineSubscription {
|
||||
kind: subscription,
|
||||
safekeeper_timeline_updates,
|
||||
watcher_handle,
|
||||
watcher,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_etcd_key_value(
|
||||
subscription_kind: SubscriptionKind,
|
||||
regex: &Regex,
|
||||
kv: &KeyValue,
|
||||
) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
|
||||
let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
|
||||
})?) {
|
||||
caps
|
||||
} else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let (zttid, safekeeper_id) = match subscription_kind {
|
||||
SubscriptionKind::All => (
|
||||
ZTenantTimelineId::new(
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
ZNodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Tenant(tenant_id) => (
|
||||
ZTenantTimelineId::new(
|
||||
tenant_id,
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
ZNodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Timeline(zttid) => (
|
||||
zttid,
|
||||
ZNodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
};
|
||||
|
||||
let info_str = kv.value_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
|
||||
})?;
|
||||
Ok(Some((
|
||||
zttid,
|
||||
SafekeeperTimeline {
|
||||
safekeeper_id,
|
||||
info: serde_json::from_str(info_str).map_err(|e| {
|
||||
BrokerError::ParsingError(format!(
|
||||
"Failed to parse '{info_str}' as safekeeper timeline info: {e}"
|
||||
))
|
||||
})?,
|
||||
},
|
||||
)))
|
||||
}
|
||||
|
||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: Display,
|
||||
{
|
||||
let capture_match = caps
|
||||
.get(index)
|
||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||
.as_str();
|
||||
capture_match.parse().map_err(|e| {
|
||||
format!(
|
||||
"Failed to parse {} from {capture_match}: {e}",
|
||||
std::any::type_name::<T>()
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
[package]
|
||||
name = "metrics"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
libc = "0.2"
|
||||
lazy_static = "1.4"
|
||||
once_cell = "1.8.0"
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
@@ -1,14 +0,0 @@
|
||||
[package]
|
||||
name = "wal_generate"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tempfile = "3.2"
|
||||
@@ -1,58 +0,0 @@
|
||||
use anyhow::*;
|
||||
use clap::{App, Arg};
|
||||
use wal_generate::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
env_logger::Builder::from_env(
|
||||
env_logger::Env::default().default_filter_or("wal_generate=info"),
|
||||
)
|
||||
.init();
|
||||
let arg_matches = App::new("Postgres WAL generator")
|
||||
.about("Generates Postgres databases with specific WAL properties")
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.short('D')
|
||||
.long("datadir")
|
||||
.takes_value(true)
|
||||
.help("Data directory for the Postgres server")
|
||||
.required(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.takes_value(true)
|
||||
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("type")
|
||||
.long("type")
|
||||
.takes_value(true)
|
||||
.help("Type of WAL to generate")
|
||||
.possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
|
||||
.required(true)
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let cfg = Conf {
|
||||
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
|
||||
datadir: arg_matches.value_of("datadir").unwrap().into(),
|
||||
};
|
||||
cfg.initdb()?;
|
||||
let mut srv = cfg.start_server()?;
|
||||
let lsn = match arg_matches.value_of("type").unwrap() {
|
||||
"simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
|
||||
"last_wal_record_crossing_segment" => {
|
||||
generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
|
||||
}
|
||||
"wal_record_crossing_segment_followed_by_small_one" => {
|
||||
generate_wal_record_crossing_segment_followed_by_small_one(
|
||||
&mut srv.connect_with_timeout()?,
|
||||
)?
|
||||
}
|
||||
a => panic!("Unknown --type argument: {}", a),
|
||||
};
|
||||
println!("end_of_wal = {}", lsn);
|
||||
srv.kill();
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,278 +0,0 @@
|
||||
use anyhow::*;
|
||||
use core::time::Duration;
|
||||
use log::*;
|
||||
use postgres::types::PgLsn;
|
||||
use postgres::Client;
|
||||
use std::cmp::Ordering;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::Instant;
|
||||
use tempfile::{tempdir, TempDir};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Conf {
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
pub datadir: PathBuf,
|
||||
}
|
||||
|
||||
pub struct PostgresServer {
|
||||
process: std::process::Child,
|
||||
_unix_socket_dir: TempDir,
|
||||
client_config: postgres::Config,
|
||||
}
|
||||
|
||||
impl Conf {
|
||||
fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
|
||||
fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
|
||||
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
|
||||
let path = self.pg_bin_dir().join(command);
|
||||
ensure!(path.exists(), "Command {:?} does not exist", path);
|
||||
let mut cmd = Command::new(path);
|
||||
cmd.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.pg_lib_dir())
|
||||
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
|
||||
Ok(cmd)
|
||||
}
|
||||
|
||||
pub fn initdb(&self) -> Result<()> {
|
||||
if let Some(parent) = self.datadir.parent() {
|
||||
info!("Pre-creating parent directory {:?}", parent);
|
||||
// Tests may be run concurrently and there may be a race to create `test_output/`.
|
||||
// std::fs::create_dir_all is guaranteed to have no races with another thread creating directories.
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
info!(
|
||||
"Running initdb in {:?} with user \"postgres\"",
|
||||
self.datadir
|
||||
);
|
||||
let output = self
|
||||
.new_pg_command("initdb")?
|
||||
.arg("-D")
|
||||
.arg(self.datadir.as_os_str())
|
||||
.args(&["-U", "postgres", "--no-instructions", "--no-sync"])
|
||||
.output()?;
|
||||
debug!("initdb output: {:?}", output);
|
||||
ensure!(
|
||||
output.status.success(),
|
||||
"initdb failed, stdout and stderr follow:\n{}{}",
|
||||
String::from_utf8_lossy(&output.stdout),
|
||||
String::from_utf8_lossy(&output.stderr),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start_server(&self) -> Result<PostgresServer> {
|
||||
info!("Starting Postgres server in {:?}", self.datadir);
|
||||
let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
|
||||
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
|
||||
let server_process = self
|
||||
.new_pg_command("postgres")?
|
||||
.args(&["-c", "listen_addresses="])
|
||||
.arg("-k")
|
||||
.arg(unix_socket_dir_path.as_os_str())
|
||||
.arg("-D")
|
||||
.arg(self.datadir.as_os_str())
|
||||
.args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
|
||||
.args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
|
||||
.args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup
|
||||
// Disable background processes as much as possible
|
||||
.args(&["-c", "wal_writer_delay=10s"])
|
||||
.args(&["-c", "autovacuum=off"])
|
||||
.stderr(Stdio::null())
|
||||
.spawn()?;
|
||||
let server = PostgresServer {
|
||||
process: server_process,
|
||||
_unix_socket_dir: unix_socket_dir,
|
||||
client_config: {
|
||||
let mut c = postgres::Config::new();
|
||||
c.host_path(&unix_socket_dir_path);
|
||||
c.user("postgres");
|
||||
c.connect_timeout(Duration::from_millis(1000));
|
||||
c
|
||||
},
|
||||
};
|
||||
Ok(server)
|
||||
}
|
||||
|
||||
pub fn pg_waldump(
|
||||
&self,
|
||||
first_segment_name: &str,
|
||||
last_segment_name: &str,
|
||||
) -> Result<std::process::Output> {
|
||||
let first_segment_file = self.datadir.join(first_segment_name);
|
||||
let last_segment_file = self.datadir.join(last_segment_name);
|
||||
info!(
|
||||
"Running pg_waldump for {} .. {}",
|
||||
first_segment_file.display(),
|
||||
last_segment_file.display()
|
||||
);
|
||||
let output = self
|
||||
.new_pg_command("pg_waldump")?
|
||||
.args(&[
|
||||
&first_segment_file.as_os_str(),
|
||||
&last_segment_file.as_os_str(),
|
||||
])
|
||||
.output()?;
|
||||
debug!("waldump output: {:?}", output);
|
||||
Ok(output)
|
||||
}
|
||||
}
|
||||
|
||||
impl PostgresServer {
|
||||
pub fn connect_with_timeout(&self) -> Result<Client> {
|
||||
let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
|
||||
while Instant::now() < retry_until {
|
||||
use std::result::Result::Ok;
|
||||
if let Ok(client) = self.client_config.connect(postgres::NoTls) {
|
||||
return Ok(client);
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
bail!("Connection timed out");
|
||||
}
|
||||
|
||||
pub fn kill(&mut self) {
|
||||
self.process.kill().unwrap();
|
||||
self.process.wait().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresServer {
|
||||
fn drop(&mut self) {
|
||||
use std::result::Result::Ok;
|
||||
match self.process.try_wait() {
|
||||
Ok(Some(_)) => return,
|
||||
Ok(None) => {
|
||||
warn!("Server was not terminated, will be killed");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Unable to get status of the server: {}, will be killed", e);
|
||||
}
|
||||
}
|
||||
let _ = self.process.kill();
|
||||
}
|
||||
}
|
||||
|
||||
pub trait PostgresClientExt: postgres::GenericClient {
|
||||
fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
|
||||
Ok(self
|
||||
.query_one("SELECT pg_current_wal_insert_lsn()", &[])?
|
||||
.get(0))
|
||||
}
|
||||
fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
|
||||
Ok(self
|
||||
.query_one("SELECT pg_current_wal_flush_lsn()", &[])?
|
||||
.get(0))
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: postgres::GenericClient> PostgresClientExt for C {}
|
||||
|
||||
fn generate_internal<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
|
||||
) -> Result<PgLsn> {
|
||||
client.execute("create extension if not exists zenith_test_utils", &[])?;
|
||||
|
||||
let wal_segment_size = client.query_one(
|
||||
"select cast(setting as bigint) as setting, unit \
|
||||
from pg_settings where name = 'wal_segment_size'",
|
||||
&[],
|
||||
)?;
|
||||
ensure!(
|
||||
wal_segment_size.get::<_, String>("unit") == "B",
|
||||
"Unexpected wal_segment_size unit"
|
||||
);
|
||||
ensure!(
|
||||
wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
|
||||
"Unexpected wal_segment_size in bytes"
|
||||
);
|
||||
|
||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
info!("LSN initial = {}", initial_lsn);
|
||||
|
||||
let last_lsn = match f(client, initial_lsn)? {
|
||||
None => client.pg_current_wal_insert_lsn()?,
|
||||
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were inserted after the generated WAL"),
|
||||
Ordering::Equal => last_lsn,
|
||||
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
||||
},
|
||||
};
|
||||
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were flushed after the generated WAL"),
|
||||
Ordering::Equal => {}
|
||||
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
||||
}
|
||||
Ok(last_lsn)
|
||||
}
|
||||
|
||||
pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
generate_internal(client, |client, _| {
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
Ok(None)
|
||||
})
|
||||
}
|
||||
|
||||
fn generate_single_logical_message(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
transactional: bool,
|
||||
) -> Result<PgLsn> {
|
||||
generate_internal(client, |client, initial_lsn| {
|
||||
ensure!(
|
||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||
"Initial LSN is too far in the future"
|
||||
);
|
||||
|
||||
let message_lsn: PgLsn = client
|
||||
.query_one(
|
||||
"select pg_logical_emit_message($1, 'big-16mb-msg', \
|
||||
concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn",
|
||||
&[&transactional],
|
||||
)?
|
||||
.get("message_lsn");
|
||||
ensure!(
|
||||
message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192),
|
||||
"Logical message did not cross the segment boundary"
|
||||
);
|
||||
ensure!(
|
||||
message_lsn < PgLsn::from(0x0400_0000),
|
||||
"Logical message crossed two segments"
|
||||
);
|
||||
|
||||
if transactional {
|
||||
// Transactional logical messages are part of a transaction, so the one above is
|
||||
// followed by a small COMMIT record.
|
||||
|
||||
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
ensure!(
|
||||
message_lsn < after_message_lsn,
|
||||
"No record found after the emitted message"
|
||||
);
|
||||
Ok(Some(after_message_lsn))
|
||||
} else {
|
||||
Ok(Some(message_lsn))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn generate_wal_record_crossing_segment_followed_by_small_one(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
) -> Result<PgLsn> {
|
||||
generate_single_logical_message(client, true)
|
||||
}
|
||||
|
||||
pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
) -> Result<PgLsn> {
|
||||
generate_single_logical_message(client, false)
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
[package]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
tracing = "0.1.27"
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
async-trait = "0.1"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
@@ -1,233 +0,0 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||
//!
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
ffi::OsStr,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::io;
|
||||
use tracing::info;
|
||||
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::{S3Bucket, S3ObjectKey},
|
||||
};
|
||||
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type RemoteObjectId;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
pub enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn new(
|
||||
working_directory: PathBuf,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{}' as a remote storage", root.display());
|
||||
LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
/// Immutable, cannot be changed once the file is created.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
|
||||
pub max_concurrent_syncs: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
.extension()
|
||||
.map(OsStr::to_string_lossy)
|
||||
{
|
||||
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
|
||||
None => Cow::Borrowed(suffix),
|
||||
};
|
||||
original_path
|
||||
.as_ref()
|
||||
.with_extension(new_extension.as_ref())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_path_with_suffix_extension() {
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
|
||||
"/foo/bar.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.baz.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,105 +0,0 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
pub mod seqwait;
|
||||
|
||||
/// append only ordered map implemented with a Vec
|
||||
pub mod vec_map;
|
||||
|
||||
// Async version of SeqWait. Currently unused.
|
||||
// pub mod seqwait_async;
|
||||
|
||||
pub mod bin_ser;
|
||||
pub mod postgres_backend;
|
||||
pub mod pq_proto;
|
||||
|
||||
// dealing with connstring parsing and handy access to it's parts
|
||||
pub mod connstring;
|
||||
|
||||
// helper functions for creating and fsyncing directories/trees
|
||||
pub mod crashsafe_dir;
|
||||
|
||||
// common authentication routines
|
||||
pub mod auth;
|
||||
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod zid;
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
// socket splitting utils
|
||||
pub mod sock_split;
|
||||
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod shutdown;
|
||||
|
||||
// Tools for calling certain async methods in sync contexts
|
||||
pub mod sync;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
pub mod tcp_listener;
|
||||
|
||||
// Utility for putting a raw file descriptor into non-blocking mode
|
||||
pub mod nonblock;
|
||||
|
||||
// Default signal handling
|
||||
pub mod signals;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
/// * building locally from git repo
|
||||
/// * building in CI from git repo
|
||||
/// * building in docker (either in CI or locally)
|
||||
///
|
||||
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||
/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
|
||||
/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
|
||||
/// Git version received from environment variable used as a fallback in git_version invokation.
|
||||
/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||
///
|
||||
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
|
||||
/// Caching and workspaces complicates that. In case `utils` is not
|
||||
/// recompiled due to caching then version may become outdated.
|
||||
/// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
|
||||
/// so if we changed the index state git_version will pick that up and rerun the macro.
|
||||
///
|
||||
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
#[macro_export]
|
||||
macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
const $const_identifier: &str = git_version::git_version!(
|
||||
prefix = "git:",
|
||||
fallback = concat!(
|
||||
"git-env:",
|
||||
env!("GIT_VERSION", "Missing GIT_VERSION envvar")
|
||||
),
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
|
||||
#[macro_export]
|
||||
macro_rules! const_assert {
|
||||
($($args:tt)*) => {
|
||||
const _: () = assert!($($args)*);
|
||||
};
|
||||
}
|
||||
25
monitoring/docker-compose.yml
Normal file
25
monitoring/docker-compose.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
version: "3"
|
||||
services:
|
||||
|
||||
prometheus:
|
||||
container_name: prometheus
|
||||
image: prom/prometheus:latest
|
||||
volumes:
|
||||
- ./prometheus.yaml:/etc/prometheus/prometheus.yml
|
||||
# ports:
|
||||
# - "9090:9090"
|
||||
# TODO: find a proper portable solution
|
||||
network_mode: "host"
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
volumes:
|
||||
- ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
|
||||
environment:
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
|
||||
- GF_AUTH_DISABLE_LOGIN_FORM=true
|
||||
# ports:
|
||||
# - "3000:3000"
|
||||
# TODO: find a proper portable solution
|
||||
network_mode: "host"
|
||||
12
monitoring/grafana.yaml
Normal file
12
monitoring/grafana.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
orgId: 1
|
||||
url: http://localhost:9090
|
||||
basicAuth: false
|
||||
isDefault: false
|
||||
version: 1
|
||||
editable: false
|
||||
5
monitoring/prometheus.yaml
Normal file
5
monitoring/prometheus.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
scrape_configs:
|
||||
- job_name: 'default'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
- targets: ['localhost:9898']
|
||||
@@ -3,14 +3,6 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
# It is simpler infra-wise to have failpoints enabled by default
|
||||
# It shouldn't affect perf in any way because failpoints
|
||||
# are not placed in hot code paths
|
||||
default = ["failpoints"]
|
||||
profiling = ["pprof"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
@@ -25,10 +17,11 @@ lazy_static = "1.4.0"
|
||||
clap = "3.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-stream = "0.1.8"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
crc32c = "0.6.0"
|
||||
@@ -38,9 +31,6 @@ humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
humantime-serde = "1.1.1"
|
||||
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
@@ -52,16 +42,16 @@ nix = "0.23"
|
||||
once_cell = "1.8.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
rusoto_core = "0.47"
|
||||
rusoto_s3 = "0.47"
|
||||
async-trait = "0.1"
|
||||
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
close_fds = "0.3.2"
|
||||
walkdir = "2.3.2"
|
||||
|
||||
[dev-dependencies]
|
||||
hex-literal = "0.3"
|
||||
|
||||
@@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
|
||||
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
@@ -155,9 +155,11 @@ or
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
prefix_in_bucket = '/test_prefix/'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
|
||||
Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
|
||||
|
||||
TODO: Sharding
|
||||
--------------------
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, ensure, Context, Result};
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
@@ -25,7 +25,7 @@ use crate::repository::Timeline;
|
||||
use crate::DatadirTimelineImpl;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
@@ -97,9 +97,7 @@ impl<'a> Basebackup<'a> {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||
// TODO include checksum
|
||||
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
// Create pgdata subdirs structure
|
||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
@@ -156,17 +154,9 @@ impl<'a> Basebackup<'a> {
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(
|
||||
img.len() == pg_constants::BLCKSZ as usize
|
||||
|| img.len() == pg_constants::BLCKSZ as usize + 8
|
||||
);
|
||||
} else {
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
}
|
||||
|
||||
slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]);
|
||||
slru_buf.extend_from_slice(&img);
|
||||
}
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
@@ -325,8 +315,7 @@ impl<'a> Basebackup<'a> {
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
|
||||
@@ -7,9 +7,7 @@ use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::page_cache;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
|
||||
@@ -2,45 +2,38 @@
|
||||
|
||||
use std::{env, path::Path, str::FromStr};
|
||||
use tracing::*;
|
||||
use zenith_utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
GIT_VERSION,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, tenant_mgr, thread_mgr,
|
||||
http, page_cache, page_service,
|
||||
remote_storage::{self, SyncStartupData},
|
||||
repository::{Repository, TimelineSyncStatusUpdate},
|
||||
tenant_mgr, thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
timelines, virtual_file, LOG_FILE_NAME,
|
||||
};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::endpoint,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn version() -> String {
|
||||
format!(
|
||||
"{GIT_VERSION} profiling:{} failpoints:{}",
|
||||
cfg!(feature = "profiling"),
|
||||
fail::has_failpoints()
|
||||
)
|
||||
}
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals::{self, Signal};
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(&*version())
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
@@ -85,25 +78,8 @@ fn main() -> anyhow::Result<()> {
|
||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
|
||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enabled-features")
|
||||
.long("enabled-features")
|
||||
.takes_value(false)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if arg_matches.is_present("enabled-features") {
|
||||
let features: &[&str] = &[
|
||||
#[cfg(feature = "failpoints")]
|
||||
"failpoints",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
println!("{{\"features\": {features:?} }}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
@@ -184,9 +160,6 @@ fn main() -> anyhow::Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = FailScenario::setup();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
@@ -202,19 +175,17 @@ fn main() -> anyhow::Result<()> {
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")
|
||||
}
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
|
||||
|
||||
info!("version: {GIT_VERSION}");
|
||||
info!("version: {}", GIT_VERSION);
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -254,14 +225,52 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => bail!("{err}. could not daemonize. bailing."),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
// Initialize repositories with locally available timelines.
|
||||
// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
// are scheduled for download and added to the repository once download is completed.
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
// initialize local tenant
|
||||
let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index);
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
remote_storage::LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {} for tenant {} is locally complete, registering it in repository", tenant_id, timeline_id);
|
||||
// Lets fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
repo.apply_timeline_remote_sync_status_update(
|
||||
timeline_id,
|
||||
TimelineSyncStatusUpdate::Downloaded,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to bootstrap timeline {} for tenant {}",
|
||||
timeline_id, tenant_id
|
||||
)
|
||||
})?
|
||||
}
|
||||
remote_storage::LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {} for tenant {} needs sync, \
|
||||
so skipped for adding into repository until sync is finished",
|
||||
tenant_id, timeline_id
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
@@ -274,8 +283,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
|
||||
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let auth_cloned = auth.clone();
|
||||
@@ -284,9 +291,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
None,
|
||||
None,
|
||||
"http_endpoint_thread",
|
||||
true,
|
||||
false,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index)?;
|
||||
let router = http::make_router(conf, auth_cloned, remote_index);
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
},
|
||||
)?;
|
||||
@@ -298,7 +305,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
None,
|
||||
None,
|
||||
"libpq endpoint thread",
|
||||
true,
|
||||
false,
|
||||
move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
|
||||
)?;
|
||||
|
||||
@@ -308,7 +315,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
std::process::exit(111);
|
||||
}
|
||||
|
||||
@@ -317,8 +323,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
pageserver::shutdown_pageserver(0);
|
||||
pageserver::shutdown_pageserver();
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
334
pageserver/src/bin/pageserver_zst.rs
Normal file
334
pageserver/src/bin/pageserver_zst.rs
Normal file
@@ -0,0 +1,334 @@
|
||||
//! A CLI helper to deal with remote storage (S3, usually) blobs as archives.
|
||||
//! See [`compression`] for more details about the archives.
|
||||
|
||||
use std::{collections::BTreeSet, path::Path};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use clap::{App, Arg};
|
||||
use pageserver::{
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
remote_storage::compression,
|
||||
};
|
||||
use tokio::{fs, io};
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
const LIST_SUBCOMMAND: &str = "list";
|
||||
const ARCHIVE_ARG_NAME: &str = "archive";
|
||||
|
||||
const EXTRACT_SUBCOMMAND: &str = "extract";
|
||||
const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory";
|
||||
|
||||
const CREATE_SUBCOMMAND: &str = "create";
|
||||
const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory";
|
||||
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("pageserver zst blob [un]compressor utility")
|
||||
.version(GIT_VERSION)
|
||||
.subcommands(vec![
|
||||
App::new(LIST_SUBCOMMAND)
|
||||
.about("List the archive contents")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to list the contents of"),
|
||||
),
|
||||
App::new(EXTRACT_SUBCOMMAND)
|
||||
.about("Extracts the archive into the directory")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to extract"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to extract the archive into. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
App::new(CREATE_SUBCOMMAND)
|
||||
.about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)")
|
||||
.arg(
|
||||
Arg::new(SOURCE_DIRECTORY_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("A directory to use for creating the archive"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to create the archive in. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
])
|
||||
.get_matches();
|
||||
|
||||
let subcommand_name = match arg_matches.subcommand_name() {
|
||||
Some(name) => name,
|
||||
None => bail!("No subcommand specified"),
|
||||
};
|
||||
|
||||
let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) {
|
||||
Some(matches) => matches,
|
||||
None => bail!(
|
||||
"No subcommand arguments were recognized for subcommand '{}'",
|
||||
subcommand_name
|
||||
),
|
||||
};
|
||||
|
||||
let target_dir = Path::new(
|
||||
subcommand_matches
|
||||
.value_of(TARGET_DIRECTORY_ARG_NAME)
|
||||
.unwrap_or("./"),
|
||||
);
|
||||
|
||||
match subcommand_name {
|
||||
LIST_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
list_archive(archive).await
|
||||
}
|
||||
EXTRACT_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
extract_archive(archive, target_dir).await
|
||||
}
|
||||
CREATE_SUBCOMMAND => {
|
||||
let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) {
|
||||
Some(source) => Path::new(source),
|
||||
None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME),
|
||||
};
|
||||
create_archive(source_dir, target_dir).await
|
||||
}
|
||||
unknown => bail!("Unknown subcommand {}", unknown),
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_archive(archive: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
println!("Listing an archive at path '{}'", archive.display());
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
let archive_bytes = fs::read(&archive)
|
||||
.await
|
||||
.context("Failed to read the archive bytes")?;
|
||||
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_bytes.as_slice())
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
|
||||
let empty_path = Path::new("");
|
||||
println!("-------------------------------");
|
||||
|
||||
let longest_path_in_archive = header
|
||||
.files
|
||||
.iter()
|
||||
.filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len()))
|
||||
.max()
|
||||
.unwrap_or_default()
|
||||
.max(METADATA_FILE_NAME.len());
|
||||
|
||||
for regular_file in &header.files {
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
regular_file.subpath.as_path(empty_path).display(),
|
||||
regular_file.size,
|
||||
width = longest_path_in_archive,
|
||||
)
|
||||
}
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
METADATA_FILE_NAME,
|
||||
header.metadata_file_size,
|
||||
width = longest_path_in_archive,
|
||||
);
|
||||
println!("-------------------------------");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
let mut dir_contents = fs::read_dir(&target_dir)
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
let dir_entry = dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
ensure!(
|
||||
dir_entry.is_none(),
|
||||
"Target directory '{}' is not empty",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Extracting an archive at path '{}' into directory '{}'",
|
||||
archive.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut archive_file = fs::File::open(&archive).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to extract the archive")
|
||||
}
|
||||
|
||||
async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let source_dir = source_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the source dir path '{}'",
|
||||
source_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
source_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
source_dir.display()
|
||||
);
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Compressing directory '{}' and creating resulting archive in directory '{}'",
|
||||
source_dir.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut metadata_file_contents = None;
|
||||
let mut files_co_archive = Vec::new();
|
||||
|
||||
let mut source_dir_contents = fs::read_dir(&source_dir)
|
||||
.await
|
||||
.context("Failed to read the source directory contents")?;
|
||||
|
||||
while let Some(source_dir_entry) = source_dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to read a source dir entry")?
|
||||
{
|
||||
let entry_path = source_dir_entry.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) {
|
||||
let metadata_bytes = fs::read(entry_path)
|
||||
.await
|
||||
.context("Failed to read metata file bytes in the source dir")?;
|
||||
metadata_file_contents = Some(
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.context("Failed to parse metata file contents in the source dir")?,
|
||||
);
|
||||
} else {
|
||||
files_co_archive.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let metadata = match metadata_file_contents {
|
||||
Some(metadata) => metadata,
|
||||
None => bail!(
|
||||
"No metadata file found in the source dir '{}', cannot create the archive",
|
||||
source_dir.display()
|
||||
),
|
||||
};
|
||||
|
||||
let _ = compression::archive_files_as_stream(
|
||||
&source_dir,
|
||||
files_co_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = target_dir.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("Failed to create an archive")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6,9 +6,8 @@ use clap::{App, Arg};
|
||||
use pageserver::layered_repository::metadata::TimelineMetadata;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith update metadata utility")
|
||||
|
||||
@@ -4,26 +4,22 @@
|
||||
//! file, or on the command line.
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use url::Url;
|
||||
use utils::{
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
@@ -31,10 +27,27 @@ pub mod defaults {
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
|
||||
pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 10;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -49,15 +62,6 @@ pub mod defaults {
|
||||
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
|
||||
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
|
||||
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
@@ -65,8 +69,14 @@ pub mod defaults {
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
|
||||
#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
|
||||
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}
|
||||
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
# [remote_storage]
|
||||
|
||||
@@ -85,6 +95,25 @@ pub struct PageServerConf {
|
||||
/// Example (default): 127.0.0.1:9898
|
||||
pub listen_http_addr: String,
|
||||
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub checkpoint_distance: u64,
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
|
||||
// How often to check if there's compaction work to be done.
|
||||
pub compaction_period: Duration,
|
||||
|
||||
// Level0 delta layer threshold for compaction.
|
||||
pub compaction_threshold: usize,
|
||||
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
pub wait_lsn_timeout: Duration,
|
||||
// How long to wait for WAL redo to complete.
|
||||
@@ -109,35 +138,6 @@ pub struct PageServerConf {
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
|
||||
pub profiling: ProfilingConfig,
|
||||
pub default_tenant_conf: TenantConf,
|
||||
|
||||
/// A prefix to add in etcd brokers before every key.
|
||||
/// Can be used for isolating different pageserver groups withing the same etcd cluster.
|
||||
pub broker_etcd_prefix: String,
|
||||
|
||||
/// Etcd broker endpoints to connect to.
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum ProfilingConfig {
|
||||
Disabled,
|
||||
PageRequests,
|
||||
}
|
||||
|
||||
impl FromStr for ProfilingConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
|
||||
let result = match s {
|
||||
"disabled" => ProfilingConfig::Disabled,
|
||||
"page_requests" => ProfilingConfig::PageRequests,
|
||||
_ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
@@ -162,6 +162,15 @@ struct PageServerConfigBuilder {
|
||||
|
||||
listen_http_addr: BuilderValue<String>,
|
||||
|
||||
checkpoint_distance: BuilderValue<u64>,
|
||||
|
||||
compaction_target_size: BuilderValue<u64>,
|
||||
compaction_period: BuilderValue<Duration>,
|
||||
compaction_threshold: BuilderValue<usize>,
|
||||
|
||||
gc_horizon: BuilderValue<u64>,
|
||||
gc_period: BuilderValue<Duration>,
|
||||
|
||||
wait_lsn_timeout: BuilderValue<Duration>,
|
||||
wal_redo_timeout: BuilderValue<Duration>,
|
||||
|
||||
@@ -181,10 +190,6 @@ struct PageServerConfigBuilder {
|
||||
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
id: BuilderValue<ZNodeId>,
|
||||
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
broker_etcd_prefix: BuilderValue<String>,
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -194,6 +199,14 @@ impl Default for PageServerConfigBuilder {
|
||||
Self {
|
||||
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
|
||||
compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE),
|
||||
compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period")),
|
||||
compaction_threshold: Set(DEFAULT_COMPACTION_THRESHOLD),
|
||||
gc_horizon: Set(DEFAULT_GC_HORIZON),
|
||||
gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period")),
|
||||
wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||
.expect("cannot parse default wait lsn timeout")),
|
||||
wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
@@ -209,9 +222,6 @@ impl Default for PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,6 +235,30 @@ impl PageServerConfigBuilder {
|
||||
self.listen_http_addr = BuilderValue::Set(listen_http_addr)
|
||||
}
|
||||
|
||||
pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) {
|
||||
self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
|
||||
}
|
||||
|
||||
pub fn compaction_target_size(&mut self, compaction_target_size: u64) {
|
||||
self.compaction_target_size = BuilderValue::Set(compaction_target_size)
|
||||
}
|
||||
|
||||
pub fn compaction_period(&mut self, compaction_period: Duration) {
|
||||
self.compaction_period = BuilderValue::Set(compaction_period)
|
||||
}
|
||||
|
||||
pub fn compaction_threshold(&mut self, compaction_threshold: usize) {
|
||||
self.compaction_threshold = BuilderValue::Set(compaction_threshold)
|
||||
}
|
||||
|
||||
pub fn gc_horizon(&mut self, gc_horizon: u64) {
|
||||
self.gc_horizon = BuilderValue::Set(gc_horizon)
|
||||
}
|
||||
|
||||
pub fn gc_period(&mut self, gc_period: Duration) {
|
||||
self.gc_period = BuilderValue::Set(gc_period)
|
||||
}
|
||||
|
||||
pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
|
||||
self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
|
||||
}
|
||||
@@ -268,70 +302,120 @@ impl PageServerConfigBuilder {
|
||||
self.remote_storage_config = BuilderValue::Set(remote_storage_config)
|
||||
}
|
||||
|
||||
pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) {
|
||||
self.broker_endpoints = BuilderValue::Set(broker_endpoints)
|
||||
}
|
||||
|
||||
pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
|
||||
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
|
||||
}
|
||||
|
||||
pub fn id(&mut self, node_id: ZNodeId) {
|
||||
self.id = BuilderValue::Set(node_id)
|
||||
}
|
||||
|
||||
pub fn profiling(&mut self, profiling: ProfilingConfig) {
|
||||
self.profiling = BuilderValue::Set(profiling)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?;
|
||||
|
||||
pub fn build(self) -> Result<PageServerConf> {
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
||||
.ok_or(anyhow::anyhow!("missing listen_pg_addr"))?,
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
.ok_or(anyhow::anyhow!("missing listen_http_addr"))?,
|
||||
checkpoint_distance: self
|
||||
.checkpoint_distance
|
||||
.ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
|
||||
compaction_target_size: self
|
||||
.compaction_target_size
|
||||
.ok_or(anyhow::anyhow!("missing compaction_target_size"))?,
|
||||
compaction_period: self
|
||||
.compaction_period
|
||||
.ok_or(anyhow::anyhow!("missing compaction_period"))?,
|
||||
compaction_threshold: self
|
||||
.compaction_threshold
|
||||
.ok_or(anyhow::anyhow!("missing compaction_threshold"))?,
|
||||
gc_horizon: self
|
||||
.gc_horizon
|
||||
.ok_or(anyhow::anyhow!("missing gc_horizon"))?,
|
||||
gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
.ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?,
|
||||
wal_redo_timeout: self
|
||||
.wal_redo_timeout
|
||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
||||
.ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?,
|
||||
page_cache_size: self
|
||||
.page_cache_size
|
||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
||||
.ok_or(anyhow::anyhow!("missing page_cache_size"))?,
|
||||
max_file_descriptors: self
|
||||
.max_file_descriptors
|
||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
||||
.ok_or(anyhow::anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?,
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?,
|
||||
.ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
.ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?,
|
||||
remote_storage_config: self
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints,
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
.ok_or(anyhow::anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow::anyhow!("missing id"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// "Login" to use when connecting to bucket.
|
||||
/// Can be empty for cases like AWS k8s IAM
|
||||
/// where we can allow certain pods to connect
|
||||
/// to the bucket directly without any credentials.
|
||||
pub access_key_id: Option<String>,
|
||||
/// "Password" to use when connecting to bucket.
|
||||
pub secret_access_key: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
//
|
||||
// Repository paths, relative to workdir.
|
||||
@@ -369,16 +453,24 @@ impl PageServerConf {
|
||||
/// validating the input and failing on errors.
|
||||
///
|
||||
/// This leaves any options not present in the file in the built-in defaults.
|
||||
pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result<Self> {
|
||||
pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result<Self> {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
|
||||
for (key, item) in toml.iter() {
|
||||
match key {
|
||||
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
|
||||
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
|
||||
"checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
|
||||
"compaction_target_size" => {
|
||||
builder.compaction_target_size(parse_toml_u64(key, item)?)
|
||||
}
|
||||
"compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?),
|
||||
"compaction_threshold" => {
|
||||
builder.compaction_threshold(parse_toml_u64(key, item)? as usize)
|
||||
}
|
||||
"gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
|
||||
"gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
|
||||
"wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
|
||||
"wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
|
||||
"initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
|
||||
@@ -392,27 +484,12 @@ impl PageServerConf {
|
||||
"auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
|
||||
PathBuf::from(parse_toml_string(key, item)?),
|
||||
)),
|
||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||
"auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
|
||||
}
|
||||
"tenant_config" => {
|
||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||
}
|
||||
"id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
|
||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
|
||||
"broker_endpoints" => builder.broker_endpoints(
|
||||
parse_toml_array(key, item)?
|
||||
.into_iter()
|
||||
.map(|endpoint_str| {
|
||||
endpoint_str.parse::<Url>().with_context(|| {
|
||||
format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<_>>()?,
|
||||
),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
_ => bail!("unrecognized pageserver option '{}'", key),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -438,75 +515,41 @@ impl PageServerConf {
|
||||
);
|
||||
}
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
// subroutine of parse_and_validate to parse `[tenant_conf]` section
|
||||
|
||||
pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
|
||||
t_conf.checkpoint_distance =
|
||||
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_target_size) = item.get("compaction_target_size") {
|
||||
t_conf.compaction_target_size = Some(parse_toml_u64(
|
||||
"compaction_target_size",
|
||||
compaction_target_size,
|
||||
)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_period) = item.get("compaction_period") {
|
||||
t_conf.compaction_period =
|
||||
Some(parse_toml_duration("compaction_period", compaction_period)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_threshold) = item.get("compaction_threshold") {
|
||||
t_conf.compaction_threshold =
|
||||
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
|
||||
}
|
||||
|
||||
if let Some(gc_horizon) = item.get("gc_horizon") {
|
||||
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
|
||||
}
|
||||
|
||||
if let Some(gc_period) = item.get("gc_period") {
|
||||
t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
|
||||
}
|
||||
|
||||
if let Some(pitr_interval) = item.get("pitr_interval") {
|
||||
t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
/// subroutine of parse_config(), to parse the `[remote_storage]` table.
|
||||
fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
|
||||
let max_concurrent_syncs = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
||||
)
|
||||
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
||||
|
||||
let max_sync_errors = NonZeroU32::new(
|
||||
parse_optional_integer("max_sync_errors", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||
)
|
||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||
|
||||
let concurrency_limit = NonZeroUsize::new(
|
||||
parse_optional_integer("concurrency_limit", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
let max_concurrent_sync: NonZeroUsize = if let Some(s) = toml.get("max_concurrent_sync") {
|
||||
parse_toml_u64("max_concurrent_sync", s)
|
||||
.and_then(|toml_u64| {
|
||||
toml_u64.try_into().with_context(|| {
|
||||
format!("'max_concurrent_sync' value {} is too large", toml_u64)
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(NonZeroUsize::new)
|
||||
.context("'max_concurrent_sync' must be a non-zero positive integer")?
|
||||
} else {
|
||||
NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap()
|
||||
};
|
||||
let max_sync_errors: NonZeroU32 = if let Some(s) = toml.get("max_sync_errors") {
|
||||
parse_toml_u64("max_sync_errors", s)
|
||||
.and_then(|toml_u64| {
|
||||
toml_u64.try_into().with_context(|| {
|
||||
format!("'max_sync_errors' value {} is too large", toml_u64)
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(NonZeroU32::new)
|
||||
.context("'max_sync_errors' must be a non-zero positive integer")?
|
||||
} else {
|
||||
NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap()
|
||||
};
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
(None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
|
||||
@@ -519,6 +562,16 @@ impl PageServerConf {
|
||||
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||
access_key_id: toml
|
||||
.get("access_key_id")
|
||||
.map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
|
||||
.transpose()?,
|
||||
secret_access_key: toml
|
||||
.get("secret_access_key")
|
||||
.map(|secret_access_key| {
|
||||
parse_toml_string("secret_access_key", secret_access_key)
|
||||
})
|
||||
.transpose()?,
|
||||
prefix_in_bucket: toml
|
||||
.get("prefix_in_bucket")
|
||||
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
|
||||
@@ -527,7 +580,6 @@ impl PageServerConf {
|
||||
.get("endpoint")
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
}),
|
||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
||||
parse_toml_string("local_path", local_path)?,
|
||||
@@ -536,7 +588,7 @@ impl PageServerConf {
|
||||
};
|
||||
|
||||
Ok(RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
})
|
||||
@@ -544,13 +596,19 @@ impl PageServerConf {
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
PathBuf::from(format!("../tmp_check/test_{}", test_name))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
PageServerConf {
|
||||
id: ZNodeId(0),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: 4 * 1024 * 1024,
|
||||
compaction_period: Duration::from_secs(10),
|
||||
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
wal_redo_timeout: Duration::from_secs(60),
|
||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||
@@ -563,10 +621,6 @@ impl PageServerConf {
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::dummy_conf(),
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -576,7 +630,7 @@ impl PageServerConf {
|
||||
fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
@@ -585,68 +639,26 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
||||
// for our use, though.
|
||||
let i: i64 = item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?;
|
||||
.with_context(|| format!("configure option {} is not an integer", name))?;
|
||||
if i < 0 {
|
||||
bail!("configure option {name} cannot be negative");
|
||||
bail!("configure option {} cannot be negative", name);
|
||||
}
|
||||
Ok(i as u64)
|
||||
}
|
||||
|
||||
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
|
||||
where
|
||||
I: TryFrom<i64, Error = E>,
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
let toml_integer = match item.get(name) {
|
||||
Some(item) => item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
I::try_from(toml_integer)
|
||||
.map(Some)
|
||||
.with_context(|| format!("configure option {name} is too large"))
|
||||
}
|
||||
|
||||
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
|
||||
Ok(humantime::parse_duration(s)?)
|
||||
}
|
||||
|
||||
fn parse_toml_from_str<T>(name: &str, item: &Item) -> anyhow::Result<T>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::fmt::Display,
|
||||
{
|
||||
fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> {
|
||||
let v = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
T::from_str(v).map_err(|e| {
|
||||
anyhow!(
|
||||
"Failed to parse string as {parse_type} for configure option {name}: {e}",
|
||||
parse_type = stringify!(T)
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
||||
let array = item
|
||||
.as_array()
|
||||
.with_context(|| format!("configure option {name} is not an array"))?;
|
||||
|
||||
array
|
||||
.iter()
|
||||
.map(|value| {
|
||||
value
|
||||
.as_str()
|
||||
.map(str::to_string)
|
||||
.with_context(|| format!("Array item {value:?} for key {name} is not a string"))
|
||||
})
|
||||
.collect()
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
AuthType::from_str(v)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -663,6 +675,15 @@ mod tests {
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
|
||||
checkpoint_distance = 111 # in bytes
|
||||
|
||||
compaction_target_size = 111 # in bytes
|
||||
compaction_period = '111 s'
|
||||
compaction_threshold = 2
|
||||
|
||||
gc_period = '222 s'
|
||||
gc_horizon = 222
|
||||
|
||||
wait_lsn_timeout = '111 s'
|
||||
wal_redo_timeout = '111 s'
|
||||
|
||||
@@ -679,16 +700,14 @@ id = 10
|
||||
fn parse_defaults() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
// we have to create dummy values to overcome the validation errors
|
||||
let config_string = format!(
|
||||
"pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
// we have to create dummy pathes to overcome the validation errors
|
||||
let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
|
||||
let parsed_config =
|
||||
PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
parsed_config,
|
||||
@@ -696,6 +715,12 @@ id = 10
|
||||
id: ZNodeId(10),
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE,
|
||||
compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?,
|
||||
compaction_threshold: defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
|
||||
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
||||
wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
|
||||
superuser: defaults::DEFAULT_SUPERUSER.to_string(),
|
||||
@@ -706,12 +731,6 @@ id = 10
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -723,16 +742,18 @@ id = 10
|
||||
fn parse_basic_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
|
||||
let config_string = format!(
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
"{}pg_distrib_dir='{}'",
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
|
||||
let parsed_config =
|
||||
PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
parsed_config,
|
||||
@@ -740,6 +761,12 @@ id = 10
|
||||
id: ZNodeId(10),
|
||||
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||
checkpoint_distance: 111,
|
||||
compaction_target_size: 111,
|
||||
compaction_period: Duration::from_secs(111),
|
||||
compaction_threshold: 2,
|
||||
gc_horizon: 222,
|
||||
gc_period: Duration::from_secs(222),
|
||||
wait_lsn_timeout: Duration::from_secs(111),
|
||||
wal_redo_timeout: Duration::from_secs(111),
|
||||
superuser: "zzzz".to_string(),
|
||||
@@ -750,12 +777,6 @@ id = 10
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -767,7 +788,6 @@ id = 10
|
||||
fn parse_remote_fs_storage_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
|
||||
let local_storage_path = tempdir.path().join("local_remote_storage");
|
||||
|
||||
@@ -785,36 +805,37 @@ local_path = '{}'"#,
|
||||
|
||||
for remote_storage_config_str in identical_toml_declarations {
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
r#"{}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
{}"#,
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display(),
|
||||
remote_storage_config_str,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for the local FS");
|
||||
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_sync: NonZeroUsize::new(
|
||||
defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -827,44 +848,47 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
let bucket_name = "some-sample-bucket".to_string();
|
||||
let bucket_region = "eu-north-1".to_string();
|
||||
let prefix_in_bucket = "test_prefix".to_string();
|
||||
let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
|
||||
let secret_access_key = "SOMEsEcReTsd292v".to_string();
|
||||
let endpoint = "http://localhost:5000".to_string();
|
||||
let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
|
||||
let max_concurrent_sync = NonZeroUsize::new(111).unwrap();
|
||||
let max_sync_errors = NonZeroU32::new(222).unwrap();
|
||||
let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
|
||||
let identical_toml_declarations = &[
|
||||
format!(
|
||||
r#"[remote_storage]
|
||||
max_concurrent_syncs = {max_concurrent_syncs}
|
||||
max_sync_errors = {max_sync_errors}
|
||||
bucket_name = '{bucket_name}'
|
||||
bucket_region = '{bucket_region}'
|
||||
prefix_in_bucket = '{prefix_in_bucket}'
|
||||
endpoint = '{endpoint}'
|
||||
concurrency_limit = {s3_concurrency_limit}"#
|
||||
max_concurrent_sync = {}
|
||||
max_sync_errors = {}
|
||||
bucket_name = '{}'
|
||||
bucket_region = '{}'
|
||||
prefix_in_bucket = '{}'
|
||||
access_key_id = '{}'
|
||||
secret_access_key = '{}'
|
||||
endpoint = '{}'"#,
|
||||
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
|
||||
),
|
||||
format!(
|
||||
"remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
|
||||
bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
|
||||
"remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}",
|
||||
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
|
||||
),
|
||||
];
|
||||
|
||||
for remote_storage_config_str in identical_toml_declarations {
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
r#"{}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
{}"#,
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display(),
|
||||
remote_storage_config_str,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for S3");
|
||||
@@ -872,14 +896,15 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: bucket_name.clone(),
|
||||
bucket_region: bucket_region.clone(),
|
||||
access_key_id: Some(access_key_id.clone()),
|
||||
secret_access_key: Some(secret_access_key.clone()),
|
||||
prefix_in_bucket: Some(prefix_in_bucket.clone()),
|
||||
endpoint: Some(endpoint.clone()),
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
endpoint: Some(endpoint.clone())
|
||||
}),
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
@@ -20,19 +20,11 @@ pub struct TimelineCreateRequest {
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Default)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub new_tenant_id: Option<ZTenantId>,
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
pub pitr_interval: Option<String>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -44,44 +36,3 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId
|
||||
pub struct StatusResponse {
|
||||
pub id: ZNodeId,
|
||||
}
|
||||
|
||||
impl TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: Option<ZTenantId>) -> TenantCreateRequest {
|
||||
TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantConfigRequest {
|
||||
pub tenant_id: ZTenantId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
pub pitr_interval: Option<String>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest {
|
||||
TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: None,
|
||||
compaction_target_size: None,
|
||||
compaction_period: None,
|
||||
compaction_threshold: None,
|
||||
gc_horizon: None,
|
||||
gc_period: None,
|
||||
image_creation_threshold: None,
|
||||
pitr_interval: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,53 +123,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get wal receiver's data attached to the timeline
|
||||
responses:
|
||||
"200":
|
||||
description: WalReceiverEntry
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/WalReceiverEntry"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Error when no wal receiver is running or found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
|
||||
parameters:
|
||||
@@ -375,7 +328,11 @@ paths:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantCreateInfo"
|
||||
type: object
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
responses:
|
||||
"201":
|
||||
description: New tenant created successfully
|
||||
@@ -414,48 +371,7 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
description: |
|
||||
Update tenant's config.
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Malformed tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -473,45 +389,6 @@ components:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
TenantCreateInfo:
|
||||
type: object
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
gc_period:
|
||||
type: string
|
||||
gc_horizon:
|
||||
type: integer
|
||||
pitr_interval:
|
||||
type: string
|
||||
checkpoint_distance:
|
||||
type: integer
|
||||
compaction_period:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
TenantConfigInfo:
|
||||
type: object
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
gc_period:
|
||||
type: string
|
||||
gc_horizon:
|
||||
type: integer
|
||||
pitr_interval:
|
||||
type: string
|
||||
checkpoint_distance:
|
||||
type: integer
|
||||
compaction_period:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -532,7 +409,6 @@ components:
|
||||
type: object
|
||||
required:
|
||||
- awaits_download
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
awaits_download:
|
||||
type: boolean
|
||||
@@ -567,21 +443,6 @@ components:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
WalReceiverEntry:
|
||||
type: object
|
||||
required:
|
||||
- thread_id
|
||||
- wal_producer_connstr
|
||||
properties:
|
||||
thread_id:
|
||||
type: integer
|
||||
wal_producer_connstr:
|
||||
type: string
|
||||
last_received_msg_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
last_received_msg_ts:
|
||||
type: integer
|
||||
|
||||
Error:
|
||||
type: object
|
||||
|
||||
@@ -1,39 +1,36 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
use zenith_utils::http::endpoint::check_permission;
|
||||
use zenith_utils::http::error::ApiError;
|
||||
use zenith_utils::http::{
|
||||
endpoint,
|
||||
error::HttpErrorBody,
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
};
|
||||
use zenith_utils::http::{RequestExt, RouterBuilder};
|
||||
use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
|
||||
TimelineCreateRequest,
|
||||
StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
|
||||
};
|
||||
use crate::remote_storage::{schedule_timeline_download, RemoteIndex};
|
||||
use crate::repository::Repository;
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -41,27 +38,17 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> Self {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
// Note that this remote storage is created separately from the main one in the sync_loop.
|
||||
// It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
Ok(Self {
|
||||
Self {
|
||||
conf,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_index,
|
||||
remote_storage,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,8 +122,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -166,47 +153,43 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
let (local_timeline_info, remote_timeline_info) = async {
|
||||
// any error here will render local timeline as None
|
||||
// XXX .in_current_span does not attach messages in spawn_blocking future to current future's span
|
||||
let local_timeline_info = tokio::task::spawn_blocking(move || {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
Ok::<_, anyhow::Error>(local_timeline)
|
||||
})
|
||||
.await
|
||||
.ok()
|
||||
.and_then(|r| r.ok())
|
||||
.flatten();
|
||||
let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
let (local_timeline_info, span) = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
(local_timeline_info, remote_timeline_info)
|
||||
}
|
||||
.instrument(info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
Ok::<_, anyhow::Error>((local_timeline, entered.exit()))
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
})
|
||||
};
|
||||
|
||||
let _enter = span.entered();
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
return Err(ApiError::NotFound(
|
||||
@@ -224,129 +207,44 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
}
|
||||
|
||||
async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let wal_receiver = tokio::task::spawn_blocking(move || {
|
||||
let _enter =
|
||||
info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id).entered();
|
||||
|
||||
crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?
|
||||
.ok_or_else(|| {
|
||||
ApiError::NotFound(format!(
|
||||
"WAL receiver not found for tenant {} and timeline {}",
|
||||
tenant_id, timeline_id
|
||||
))
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, wal_receiver)
|
||||
}
|
||||
|
||||
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
info!(
|
||||
"Handling timeline {} attach for tenant: {}",
|
||||
timeline_id, tenant_id,
|
||||
);
|
||||
let span = info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
|
||||
let span = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
|
||||
// TODO: maybe answer with 309 Not Modified here?
|
||||
anyhow::bail!("Timeline is already present locally")
|
||||
};
|
||||
Ok(())
|
||||
Ok(entered.exit())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
let mut remote_index_write = get_state(&request).remote_index.write().await;
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
} else {
|
||||
// no timeline in the index, release the lock to make the potentially lengthy download opetation
|
||||
drop(index_accessor);
|
||||
}
|
||||
|
||||
let new_timeline = match try_download_index_part_data(state, sync_id).await {
|
||||
Ok(Some(mut new_timeline)) => {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
new_timeline.awaits_download = true;
|
||||
new_timeline
|
||||
}
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote timeline data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote timeline".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
match index_accessor.timeline_entry_mut(&sync_id) {
|
||||
Some(remote_timeline) => {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
|
||||
}
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn try_download_index_part_data(
|
||||
state: &State,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<Option<RemoteTimeline>> {
|
||||
let index_part = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
storage_sync::download_index_part(state.conf, local_storage, sync_id).await
|
||||
}
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;
|
||||
|
||||
let timeline_path = state
|
||||
.conf
|
||||
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.map(Some)
|
||||
.with_context(|| {
|
||||
format!("Failed to convert index part into remote timeline for timeline {sync_id}")
|
||||
let _enter = span.entered(); // entered guard cannot live across awaits (non Send)
|
||||
let index_entry = remote_index_write
|
||||
.timeline_entry_mut(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.ok_or_else(|| ApiError::NotFound("Unknown remote timeline".to_string()))?;
|
||||
|
||||
if index_entry.get_awaits_download() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
index_entry.set_awaits_download(true);
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -359,8 +257,8 @@ async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let _enter =
|
||||
info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
|
||||
.entered();
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
repo.detach_timeline(timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -377,7 +275,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
crate::tenant_mgr::list_tenants()
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -389,28 +287,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
let remote_index = get_state(&request).remote_index.clone();
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
let target_tenant_id = request_data
|
||||
.new_tenant_id
|
||||
.map(ZTenantId::from)
|
||||
@@ -418,9 +294,8 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let new_tenant_id = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let conf = get_config(&request);
|
||||
|
||||
tenant_mgr::create_tenant_repository(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
tenant_mgr::create_tenant_repository(get_config(&request), target_tenant_id, remote_index)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -431,45 +306,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
})
|
||||
}
|
||||
|
||||
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantConfigRequest = json_request(&mut request).await?;
|
||||
let tenant_id = request_data.tenant_id;
|
||||
// check for management permission
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
tenant_mgr::update_tenant_config(tenant_conf, tenant_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -481,7 +317,7 @@ pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
if auth.is_some() {
|
||||
@@ -495,24 +331,17 @@ pub fn make_router(
|
||||
}))
|
||||
}
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
|
||||
))
|
||||
router
|
||||
.data(Arc::new(State::new(conf, auth, remote_index)))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.put("/v1/tenant/config", tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
|
||||
wal_receiver_get_handler,
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
|
||||
timeline_attach_handler,
|
||||
@@ -521,5 +350,5 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
|
||||
timeline_detach_handler,
|
||||
)
|
||||
.any(handler_404))
|
||||
.any(handler_404)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||
//! a zenith Timeline.
|
||||
//!
|
||||
use std::fs;
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek, SeekFrom};
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -9,19 +10,17 @@ use std::path::{Path, PathBuf};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use tracing::*;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::reltag::{RelTag, SlruKind};
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::Oid;
|
||||
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||
use utils::lsn::Lsn;
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
@@ -36,30 +35,100 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
||||
) -> Result<()> {
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
// TODO this shoud be start_lsn, which is not necessarily equal to end_lsn (aka lsn)
|
||||
// Then fishing out pg_control would be unnecessary
|
||||
let mut modification = tline.begin_modification(lsn);
|
||||
modification.init_empty()?;
|
||||
|
||||
// Import all but pg_wal
|
||||
let all_but_wal = WalkDir::new(path)
|
||||
.into_iter()
|
||||
.filter_entry(|entry| !entry.path().ends_with("pg_wal"));
|
||||
for entry in all_but_wal {
|
||||
let entry = entry?;
|
||||
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||
if metadata.is_file() {
|
||||
let absolute_path = entry.path();
|
||||
let relative_path = absolute_path.strip_prefix(path)?;
|
||||
// Scan 'global'
|
||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
||||
for direntry in fs::read_dir(path.join("global"))? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
let file = File::open(absolute_path)?;
|
||||
let len = metadata.len() as usize;
|
||||
if let Some(control_file) = import_file(&mut modification, relative_path, file, len)? {
|
||||
pg_control = Some(control_file);
|
||||
Some("pg_control") => {
|
||||
pg_control = Some(import_control_file(&mut modification, &direntry.path())?);
|
||||
}
|
||||
modification.flush()?;
|
||||
Some("pg_filenode.map") => {
|
||||
import_relmap_file(
|
||||
&mut modification,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
&direntry.path(),
|
||||
)?;
|
||||
}
|
||||
|
||||
// Load any relation files into the page server (but only after the other files)
|
||||
_ => relfiles.push(direntry.path()),
|
||||
}
|
||||
}
|
||||
for relfile in relfiles {
|
||||
import_relfile(
|
||||
&mut modification,
|
||||
&relfile,
|
||||
pg_constants::GLOBALTABLESPACE_OID,
|
||||
0,
|
||||
)?;
|
||||
}
|
||||
|
||||
// Scan 'base'. It contains database dirs, the database OID is the filename.
|
||||
// E.g. 'base/12345', where 12345 is the database OID.
|
||||
for direntry in fs::read_dir(path.join("base"))? {
|
||||
let direntry = direntry?;
|
||||
|
||||
//skip all temporary files
|
||||
if direntry.file_name().to_string_lossy() == "pgsql_tmp" {
|
||||
continue;
|
||||
}
|
||||
|
||||
let dboid = direntry.file_name().to_string_lossy().parse::<u32>()?;
|
||||
|
||||
let mut relfiles: Vec<PathBuf> = Vec::new();
|
||||
for direntry in fs::read_dir(direntry.path())? {
|
||||
let direntry = direntry?;
|
||||
match direntry.file_name().to_str() {
|
||||
None => continue,
|
||||
|
||||
Some("PG_VERSION") => {
|
||||
//modification.put_dbdir_creation(pg_constants::DEFAULTTABLESPACE_OID, dboid)?;
|
||||
}
|
||||
Some("pg_filenode.map") => import_relmap_file(
|
||||
&mut modification,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
&direntry.path(),
|
||||
)?,
|
||||
|
||||
// Load any relation files into the page server
|
||||
_ => relfiles.push(direntry.path()),
|
||||
}
|
||||
}
|
||||
for relfile in relfiles {
|
||||
import_relfile(
|
||||
&mut modification,
|
||||
&relfile,
|
||||
pg_constants::DEFAULTTABLESPACE_OID,
|
||||
dboid,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_xact"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(&mut modification, SlruKind::Clog, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("members"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(&mut modification, SlruKind::MultiXactMembers, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_multixact").join("offsets"))? {
|
||||
let entry = entry?;
|
||||
import_slru_file(&mut modification, SlruKind::MultiXactOffsets, &entry.path())?;
|
||||
}
|
||||
for entry in fs::read_dir(path.join("pg_twophase"))? {
|
||||
let entry = entry?;
|
||||
let xid = u32::from_str_radix(&entry.path().to_string_lossy(), 16)?;
|
||||
import_twophase_file(&mut modification, xid, &entry.path())?;
|
||||
}
|
||||
// TODO: Scan pg_tblspc
|
||||
|
||||
// We're done importing all the data files.
|
||||
modification.commit()?;
|
||||
@@ -89,30 +158,31 @@ pub fn import_timeline_from_postgres_datadir<R: Repository>(
|
||||
}
|
||||
|
||||
// subroutine of import_timeline_from_postgres_datadir(), to load one relation file.
|
||||
fn import_rel<R: Repository, Reader: Read>(
|
||||
fn import_relfile<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
path: &Path,
|
||||
spcoid: Oid,
|
||||
dboid: Oid,
|
||||
mut reader: Reader,
|
||||
len: usize,
|
||||
) -> anyhow::Result<()> {
|
||||
// Does it look like a relation file?
|
||||
trace!("importing rel file {}", path.display());
|
||||
|
||||
let filename = &path
|
||||
.file_name()
|
||||
.expect("missing rel filename")
|
||||
.to_string_lossy();
|
||||
let (relnode, forknum, segno) = parse_relfilename(filename).map_err(|e| {
|
||||
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
|
||||
e
|
||||
})?;
|
||||
let (relnode, forknum, segno) = parse_relfilename(&path.file_name().unwrap().to_string_lossy())
|
||||
.map_err(|e| {
|
||||
warn!("unrecognized file in postgres datadir: {:?} ({})", path, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
|
||||
ensure!(len % pg_constants::BLCKSZ as usize == 0);
|
||||
let nblocks = len / pg_constants::BLCKSZ as usize;
|
||||
let len = file.metadata().unwrap().len();
|
||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0);
|
||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
||||
|
||||
if segno != 0 {
|
||||
todo!();
|
||||
}
|
||||
|
||||
let rel = RelTag {
|
||||
spcnode: spcoid,
|
||||
@@ -120,22 +190,11 @@ fn import_rel<R: Repository, Reader: Read>(
|
||||
relnode,
|
||||
forknum,
|
||||
};
|
||||
modification.put_rel_creation(rel, nblocks as u32)?;
|
||||
|
||||
let mut blknum: u32 = segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
|
||||
// Call put_rel_creation for every segment of the relation,
|
||||
// because there is no guarantee about the order in which we are processing segments.
|
||||
// ignore "relation already exists" error
|
||||
if let Err(e) = modification.put_rel_creation(rel, nblocks as u32) {
|
||||
if e.to_string().contains("already exists") {
|
||||
debug!("relation {} already exists. we must be extending it", rel);
|
||||
} else {
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let r = reader.read_exact(&mut buf);
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
modification.put_rel_page_image(rel, blknum, Bytes::copy_from_slice(&buf))?;
|
||||
@@ -145,9 +204,7 @@ fn import_rel<R: Repository, Reader: Read>(
|
||||
Err(err) => match err.kind() {
|
||||
std::io::ErrorKind::UnexpectedEof => {
|
||||
// reached EOF. That's expected.
|
||||
let relative_blknum =
|
||||
blknum - segno * (1024 * 1024 * 1024 / pg_constants::BLCKSZ as u32);
|
||||
ensure!(relative_blknum == nblocks as u32, "unexpected EOF");
|
||||
ensure!(blknum == nblocks as u32, "unexpected EOF");
|
||||
break;
|
||||
}
|
||||
_ => {
|
||||
@@ -158,43 +215,96 @@ fn import_rel<R: Repository, Reader: Read>(
|
||||
blknum += 1;
|
||||
}
|
||||
|
||||
// Update relation size
|
||||
//
|
||||
// If we process rel segments out of order,
|
||||
// put_rel_extend will skip the update.
|
||||
modification.put_rel_extend(rel, blknum)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Import a relmapper (pg_filenode.map) file into the repository
|
||||
fn import_relmap_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
spcnode: Oid,
|
||||
dbnode: Oid,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing relmap file {}", path.display());
|
||||
|
||||
modification.put_relmap_file(spcnode, dbnode, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Import a twophase state file (pg_twophase/<xid>) into the repository
|
||||
fn import_twophase_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
xid: TransactionId,
|
||||
path: &Path,
|
||||
) -> Result<()> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing non-rel file {}", path.display());
|
||||
|
||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&buffer[..]))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Import pg_control file into the repository.
|
||||
///
|
||||
/// The control file is imported as is, but we also extract the checkpoint record
|
||||
/// from it and store it separated.
|
||||
fn import_control_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
path: &Path,
|
||||
) -> Result<ControlFileData> {
|
||||
let mut file = File::open(path)?;
|
||||
let mut buffer = Vec::new();
|
||||
// read the whole file
|
||||
file.read_to_end(&mut buffer)?;
|
||||
|
||||
trace!("importing control file {}", path.display());
|
||||
|
||||
// Import it as ControlFile
|
||||
modification.put_control_file(Bytes::copy_from_slice(&buffer[..]))?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&buffer)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
modification.put_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
}
|
||||
|
||||
///
|
||||
/// Import an SLRU segment file
|
||||
///
|
||||
fn import_slru<R: Repository, Reader: Read>(
|
||||
fn import_slru_file<R: Repository>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
slru: SlruKind,
|
||||
path: &Path,
|
||||
mut reader: Reader,
|
||||
len: usize,
|
||||
) -> Result<()> {
|
||||
trace!("importing slru file {}", path.display());
|
||||
|
||||
let mut file = File::open(path)?;
|
||||
let mut buf: [u8; 8192] = [0u8; 8192];
|
||||
let filename = &path
|
||||
.file_name()
|
||||
.expect("missing slru filename")
|
||||
.to_string_lossy();
|
||||
let segno = u32::from_str_radix(filename, 16)?;
|
||||
let segno = u32::from_str_radix(&path.file_name().unwrap().to_string_lossy(), 16)?;
|
||||
|
||||
ensure!(len % pg_constants::BLCKSZ as usize == 0); // we assume SLRU block size is the same as BLCKSZ
|
||||
let nblocks = len / pg_constants::BLCKSZ as usize;
|
||||
let len = file.metadata().unwrap().len();
|
||||
ensure!(len % pg_constants::BLCKSZ as u64 == 0); // we assume SLRU block size is the same as BLCKSZ
|
||||
let nblocks = len / pg_constants::BLCKSZ as u64;
|
||||
|
||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize);
|
||||
ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as u64);
|
||||
|
||||
modification.put_slru_segment_creation(slru, segno, nblocks as u32)?;
|
||||
|
||||
let mut rpageno = 0;
|
||||
loop {
|
||||
let r = reader.read_exact(&mut buf);
|
||||
let r = file.read_exact(&mut buf);
|
||||
match r {
|
||||
Ok(_) => {
|
||||
modification.put_slru_page_image(
|
||||
@@ -286,272 +396,10 @@ fn import_wal<R: Repository>(
|
||||
}
|
||||
|
||||
if last_lsn != startpoint {
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
debug!("reached end of WAL at {}", last_lsn);
|
||||
} else {
|
||||
info!("no WAL to import at {}", last_lsn);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_basebackup_from_tar<R: Repository, Reader: Read>(
|
||||
tline: &mut DatadirTimeline<R>,
|
||||
reader: Reader,
|
||||
base_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
info!("importing base at {}", base_lsn);
|
||||
let mut modification = tline.begin_modification(base_lsn);
|
||||
modification.init_empty()?;
|
||||
|
||||
let mut pg_control: Option<ControlFileData> = None;
|
||||
|
||||
// Import base
|
||||
for base_tar_entry in tar::Archive::new(reader).entries()? {
|
||||
let entry = base_tar_entry?;
|
||||
let header = entry.header();
|
||||
let len = header.entry_size()? as usize;
|
||||
let file_path = header.path()?.into_owned();
|
||||
|
||||
match header.entry_type() {
|
||||
tar::EntryType::Regular => {
|
||||
if let Some(res) = import_file(&mut modification, file_path.as_ref(), entry, len)? {
|
||||
// We found the pg_control file.
|
||||
pg_control = Some(res);
|
||||
}
|
||||
modification.flush()?;
|
||||
}
|
||||
tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
}
|
||||
_ => {
|
||||
panic!("tar::EntryType::?? {}", file_path.display());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sanity check: ensure that pg_control is loaded
|
||||
let _pg_control = pg_control.context("pg_control file not found")?;
|
||||
|
||||
modification.commit()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_wal_from_tar<R: Repository, Reader: Read>(
|
||||
tline: &mut DatadirTimeline<R>,
|
||||
reader: Reader,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
) -> Result<()> {
|
||||
// Set up walingest mutable state
|
||||
let mut waldecoder = WalStreamDecoder::new(start_lsn);
|
||||
let mut segno = start_lsn.segment_number(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut offset = start_lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE);
|
||||
let mut last_lsn = start_lsn;
|
||||
let mut walingest = WalIngest::new(tline, start_lsn)?;
|
||||
|
||||
// Ingest wal until end_lsn
|
||||
info!("importing wal until {}", end_lsn);
|
||||
let mut pg_wal_tar = tar::Archive::new(reader);
|
||||
let mut pg_wal_entries_iter = pg_wal_tar.entries()?;
|
||||
while last_lsn <= end_lsn {
|
||||
let bytes = {
|
||||
let entry = pg_wal_entries_iter.next().expect("expected more wal")?;
|
||||
let header = entry.header();
|
||||
let file_path = header.path()?.into_owned();
|
||||
|
||||
match header.entry_type() {
|
||||
tar::EntryType::Regular => {
|
||||
// FIXME: assume postgresql tli 1 for now
|
||||
let expected_filename = XLogFileName(1, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let file_name = file_path
|
||||
.file_name()
|
||||
.expect("missing wal filename")
|
||||
.to_string_lossy();
|
||||
ensure!(expected_filename == file_name);
|
||||
|
||||
debug!("processing wal file {:?}", file_path);
|
||||
read_all_bytes(entry)?
|
||||
}
|
||||
tar::EntryType::Directory => {
|
||||
debug!("directory {:?}", file_path);
|
||||
continue;
|
||||
}
|
||||
_ => {
|
||||
panic!("tar::EntryType::?? {}", file_path.display());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
waldecoder.feed_bytes(&bytes[offset..]);
|
||||
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
walingest.ingest_record(tline, recdata, lsn)?;
|
||||
last_lsn = lsn;
|
||||
|
||||
debug!("imported record at {} (end {})", lsn, end_lsn);
|
||||
}
|
||||
}
|
||||
|
||||
debug!("imported records up to {}", last_lsn);
|
||||
segno += 1;
|
||||
offset = 0;
|
||||
}
|
||||
|
||||
if last_lsn != start_lsn {
|
||||
info!("reached end of WAL at {}", last_lsn);
|
||||
} else {
|
||||
info!("there was no WAL to import at {}", last_lsn);
|
||||
}
|
||||
|
||||
// Log any extra unused files
|
||||
for e in &mut pg_wal_entries_iter {
|
||||
let entry = e?;
|
||||
let header = entry.header();
|
||||
let file_path = header.path()?.into_owned();
|
||||
info!("skipping {:?}", file_path);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn import_file<R: Repository, Reader: Read>(
|
||||
modification: &mut DatadirModification<R>,
|
||||
file_path: &Path,
|
||||
reader: Reader,
|
||||
len: usize,
|
||||
) -> Result<Option<ControlFileData>> {
|
||||
debug!("looking at {:?}", file_path);
|
||||
|
||||
if file_path.starts_with("global") {
|
||||
let spcnode = pg_constants::GLOBALTABLESPACE_OID;
|
||||
let dbnode = 0;
|
||||
|
||||
match file_path
|
||||
.file_name()
|
||||
.expect("missing filename")
|
||||
.to_string_lossy()
|
||||
.as_ref()
|
||||
{
|
||||
"pg_control" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&bytes[..])?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
|
||||
modification.put_checkpoint(checkpoint_bytes)?;
|
||||
debug!("imported control file");
|
||||
|
||||
// Import it as ControlFile
|
||||
modification.put_control_file(bytes)?;
|
||||
return Ok(Some(pg_control));
|
||||
}
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
debug!("ignored");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||
debug!("imported rel creation");
|
||||
}
|
||||
}
|
||||
} else if file_path.starts_with("base") {
|
||||
let spcnode = pg_constants::DEFAULTTABLESPACE_OID;
|
||||
let dbnode: u32 = file_path
|
||||
.iter()
|
||||
.nth(1)
|
||||
.expect("invalid file path, expected dbnode")
|
||||
.to_string_lossy()
|
||||
.parse()?;
|
||||
|
||||
match file_path
|
||||
.file_name()
|
||||
.expect("missing base filename")
|
||||
.to_string_lossy()
|
||||
.as_ref()
|
||||
{
|
||||
"pg_filenode.map" => {
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_relmap_file(spcnode, dbnode, bytes)?;
|
||||
debug!("imported relmap file")
|
||||
}
|
||||
"PG_VERSION" => {
|
||||
debug!("ignored");
|
||||
}
|
||||
_ => {
|
||||
import_rel(modification, file_path, spcnode, dbnode, reader, len)?;
|
||||
debug!("imported rel creation");
|
||||
}
|
||||
}
|
||||
} else if file_path.starts_with("pg_xact") {
|
||||
let slru = SlruKind::Clog;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
debug!("imported clog slru");
|
||||
} else if file_path.starts_with("pg_multixact/offsets") {
|
||||
let slru = SlruKind::MultiXactOffsets;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
debug!("imported multixact offsets slru");
|
||||
} else if file_path.starts_with("pg_multixact/members") {
|
||||
let slru = SlruKind::MultiXactMembers;
|
||||
|
||||
import_slru(modification, slru, file_path, reader, len)?;
|
||||
debug!("imported multixact members slru");
|
||||
} else if file_path.starts_with("pg_twophase") {
|
||||
let file_name = &file_path
|
||||
.file_name()
|
||||
.expect("missing twophase filename")
|
||||
.to_string_lossy();
|
||||
let xid = u32::from_str_radix(file_name, 16)?;
|
||||
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
modification.put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]))?;
|
||||
debug!("imported twophase file");
|
||||
} else if file_path.starts_with("pg_wal") {
|
||||
debug!("found wal file in base section. ignore it");
|
||||
} else if file_path.starts_with("zenith.signal") {
|
||||
// Parse zenith signal file to set correct previous LSN
|
||||
let bytes = read_all_bytes(reader)?;
|
||||
// zenith.signal format is "PREV LSN: prev_lsn"
|
||||
// TODO write serialization and deserialization in the same place.
|
||||
let zenith_signal = std::str::from_utf8(&bytes)?.trim();
|
||||
let prev_lsn = match zenith_signal {
|
||||
"PREV LSN: none" => Lsn(0),
|
||||
"PREV LSN: invalid" => Lsn(0),
|
||||
other => {
|
||||
let split = other.split(':').collect::<Vec<_>>();
|
||||
split[1]
|
||||
.trim()
|
||||
.parse::<Lsn>()
|
||||
.context("can't parse zenith.signal")?
|
||||
}
|
||||
};
|
||||
|
||||
// zenith.signal is not necessarily the last file, that we handle
|
||||
// but it is ok to call `finish_write()`, because final `modification.commit()`
|
||||
// will update lsn once more to the final one.
|
||||
let writer = modification.tline.tline.writer();
|
||||
writer.finish_write(prev_lsn);
|
||||
|
||||
debug!("imported zenith signal {}", prev_lsn);
|
||||
} else if file_path.starts_with("pg_tblspc") {
|
||||
// TODO Backups exported from neon won't have pg_tblspc, but we will need
|
||||
// this to import arbitrary postgres databases.
|
||||
bail!("Importing pg_tblspc is not implemented");
|
||||
} else {
|
||||
debug!("ignored");
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn read_all_bytes<Reader: Read>(mut reader: Reader) -> Result<Bytes> {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
reader.read_to_end(&mut buf)?;
|
||||
Ok(Bytes::copy_from_slice(&buf[..]))
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,7 +23,6 @@ distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between.
|
||||
|
||||
```
|
||||
Cloud Storage Page Server Safekeeper
|
||||
L1 L0 Memory WAL
|
||||
|
||||
@@ -38,7 +37,6 @@ Cloud Storage Page Server Safekeeper
|
||||
+----+----+ +----+----+ | | |
|
||||
|EEEE| |EEEE|EEEE| +---+-----+
|
||||
+----+ +----+----+
|
||||
```
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
@@ -49,7 +47,7 @@ the same page and relation close to each other.
|
||||
From the page server memory, whenever enough WAL has been accumulated, it is flushed
|
||||
to disk into a new L0 layer file, and the memory is released.
|
||||
|
||||
When enough L0 files have been accumulated, they are merged together and sliced
|
||||
When enough L0 files have been accumulated, they are merged together rand sliced
|
||||
per key-space, producing a new set of files where each file contains a more
|
||||
narrow key range, but larger LSN range.
|
||||
|
||||
@@ -123,7 +121,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
|
||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||
timeline are stored in the timeline's subdirectory under
|
||||
`.zenith/tenants/<tenantid>/timelines`.
|
||||
.zenith/tenants/<tenantid>/timelines.
|
||||
|
||||
There are two kind of layer files: images, and delta layers. An image file
|
||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||
@@ -132,11 +130,8 @@ range of LSN.
|
||||
|
||||
image file:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
start key end key LSN
|
||||
```
|
||||
|
||||
|
||||
The first parts define the key range that the layer covers. See
|
||||
pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
|
||||
@@ -145,10 +140,8 @@ delta file:
|
||||
|
||||
Delta files are named similarly, but they cover a range of LSNs:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
start key end key start LSN end LSN
|
||||
```
|
||||
|
||||
A delta file contains all the key-values in the key-range that were updated in
|
||||
the LSN range. If a key has not been modified, there is no trace of it in the
|
||||
@@ -158,9 +151,7 @@ delta layer.
|
||||
A delta layer file can cover a part of the overall key space, as in the previous
|
||||
example, or the whole key range like this:
|
||||
|
||||
```
|
||||
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
|
||||
```
|
||||
|
||||
A file that covers the whole key range is called a L0 file (Level 0), while a
|
||||
file that covers only part of the key range is called a L1 file. The "level" of
|
||||
@@ -177,9 +168,7 @@ version, and how branching and GC works is still valid.
|
||||
|
||||
The full path of a delta file looks like this:
|
||||
|
||||
```
|
||||
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||
```
|
||||
|
||||
For simplicity, the examples below use a simplified notation for the
|
||||
paths. The tenant ID is left out, the timeline ID is replaced with
|
||||
@@ -188,10 +177,8 @@ with a human-readable table name. The LSNs are also shorter. For
|
||||
example, a base image file at LSN 100 and a delta file between 100-200
|
||||
for 'orders' table on 'main' branch is represented like this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
```
|
||||
|
||||
|
||||
# Creating layer files
|
||||
@@ -201,14 +188,12 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
|
||||
of WAL is currently at LSN 250. In this starting situation, you would
|
||||
have these files on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
In addition to those files, the recent changes between LSN 200 and the
|
||||
end of WAL at 250 are kept in memory. If the page server crashes, the
|
||||
@@ -239,7 +224,6 @@ If the customers table is modified later, a new file is created for it
|
||||
at the next checkpoint. The new file will cover the "gap" from the
|
||||
last layer file, so the LSN ranges are always contiguous:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -252,7 +236,6 @@ last layer file, so the LSN ranges are always contiguous:
|
||||
main/customers_200
|
||||
main/customers_200_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
## Reading page versions
|
||||
|
||||
@@ -276,18 +259,15 @@ involves replaying any WAL records applicable to the page between LSNs
|
||||
|
||||
Imagine that a child branch is created at LSN 250:
|
||||
|
||||
```
|
||||
@250
|
||||
----main--+-------------------------->
|
||||
\
|
||||
+---child-------------->
|
||||
```
|
||||
|
||||
|
||||
Then, the 'orders' table is updated differently on the 'main' and
|
||||
'child' branches. You now have this situation on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -302,7 +282,6 @@ Then, the 'orders' table is updated differently on the 'main' and
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
Because the 'customers' table hasn't been modified on the child
|
||||
branch, there is no file for it there. If you request a page for it on
|
||||
@@ -315,7 +294,6 @@ is linear, and the request's LSN identifies unambiguously which file
|
||||
you need to look at. For example, the history for the 'orders' table
|
||||
on the 'main' branch consists of these files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -323,12 +301,10 @@ on the 'main' branch consists of these files:
|
||||
main/orders_300
|
||||
main/orders_300_400
|
||||
main/orders_400
|
||||
```
|
||||
|
||||
And from the 'child' branch's point of view, it consists of these
|
||||
files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -337,7 +313,6 @@ files:
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
The branch metadata includes the point where the child branch was
|
||||
created, LSN 250. If a page request comes with LSN 275, we read the
|
||||
@@ -370,7 +345,6 @@ Let's look at the single branch scenario again. Imagine that the end
|
||||
of the branch is LSN 525, so that the GC horizon is currently at
|
||||
525-150 = 375
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -383,13 +357,11 @@ of the branch is LSN 525, so that the GC horizon is currently at
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
We can remove the following files because the end LSNs of those files are
|
||||
older than GC horizon 375, and there are more recent layer files for the
|
||||
table:
|
||||
|
||||
```
|
||||
main/orders_100 DELETE
|
||||
main/orders_100_200 DELETE
|
||||
main/orders_200 DELETE
|
||||
@@ -402,9 +374,8 @@ table:
|
||||
main/customers_100 DELETE
|
||||
main/customers_100_200 DELETE
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
'main/customers_200' is old enough, but it cannot be
|
||||
'main/customers_100_200' is old enough, but it cannot be
|
||||
removed because there is no newer layer file for the table.
|
||||
|
||||
Things get slightly more complicated with multiple branches. All of
|
||||
@@ -413,7 +384,6 @@ retain older shapshot files that are still needed by child branches.
|
||||
For example, if child branch is created at LSN 150, and the 'customers'
|
||||
table is updated on the branch, you would have these files:
|
||||
|
||||
```
|
||||
main/orders_100 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_100_200 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_200 DELETE
|
||||
@@ -428,7 +398,6 @@ table is updated on the branch, you would have these files:
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
child/customers_150_300 DELETE
|
||||
child/customers_300 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
|
||||
be removed, even though they are older than the GC horizon, because
|
||||
@@ -438,7 +407,6 @@ and 'main/orders_200_300' can still be removed.
|
||||
If 'orders' is modified later on the 'child' branch, we will create a
|
||||
new base image and delta file for it on the child:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
|
||||
@@ -451,7 +419,6 @@ new base image and delta file for it on the child:
|
||||
child/customers_300
|
||||
child/orders_150_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
After this, the 'main/orders_100' and 'main/orders_100_200' file could
|
||||
be removed. It is no longer needed by the child branch, because there
|
||||
@@ -467,7 +434,6 @@ Describe GC and checkpoint interval settings.
|
||||
In principle, each relation can be checkpointed separately, i.e. the
|
||||
LSN ranges of the files don't need to line up. So this would be legal:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -480,7 +446,6 @@ LSN ranges of the files don't need to line up. So this would be legal:
|
||||
main/customers_250
|
||||
main/customers_250_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
However, the code currently always checkpoints all relations together.
|
||||
So that situation doesn't arise in practice.
|
||||
@@ -503,13 +468,11 @@ does that. It could be useful, however, as a transient state when
|
||||
garbage collecting around branch points, or explicit recovery
|
||||
points. For example, if we start with this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/orders_200_300
|
||||
main/orders_300
|
||||
```
|
||||
|
||||
And there is a branch or explicit recovery point at LSN 150, we could
|
||||
replace 'main/orders_100_200' with 'main/orders_150' to keep a
|
||||
|
||||
@@ -1,20 +1,12 @@
|
||||
//!
|
||||
//! Functions for reading and writing variable-sized "blobs".
|
||||
//!
|
||||
//! Each blob begins with a 1- or 4-byte length field, followed by the
|
||||
//! actual data. If the length is smaller than 128 bytes, the length
|
||||
//! is written as a one byte. If it's larger than that, the length
|
||||
//! is written as a four-byte integer, in big-endian, with the high
|
||||
//! bit set. This way, we can detect whether it's 1- or 4-byte header
|
||||
//! by peeking at the first byte.
|
||||
//!
|
||||
//! len < 128: 0XXXXXXX
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//! Each blob begins with a 4-byte length, followed by the actual data.
|
||||
//!
|
||||
use crate::layered_repository::block_io::{BlockCursor, BlockReader};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::io::Error;
|
||||
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
@@ -34,7 +26,7 @@ pub trait BlobCursor {
|
||||
) -> Result<(), std::io::Error>;
|
||||
}
|
||||
|
||||
impl<R> BlobCursor for BlockCursor<R>
|
||||
impl<'a, R> BlobCursor for BlockCursor<R>
|
||||
where
|
||||
R: BlockReader,
|
||||
{
|
||||
@@ -48,30 +40,21 @@ where
|
||||
|
||||
let mut buf = self.read_blk(blknum)?;
|
||||
|
||||
// peek at the first byte, to determine if it's a 1- or 4-byte length
|
||||
let first_len_byte = buf[off];
|
||||
let len: usize = if first_len_byte < 0x80 {
|
||||
// 1-byte length header
|
||||
off += 1;
|
||||
first_len_byte as usize
|
||||
// read length
|
||||
let mut len_buf = [0u8; 4];
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
// 4-byte length header
|
||||
let mut len_buf = [0u8; 4];
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
len_buf[0] &= 0x7f;
|
||||
u32::from_be_bytes(len_buf) as usize
|
||||
};
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
let len = u32::from_ne_bytes(len_buf) as usize;
|
||||
|
||||
dstbuf.clear();
|
||||
|
||||
@@ -147,27 +130,10 @@ where
|
||||
{
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
let offset = self.offset;
|
||||
|
||||
if srcbuf.len() < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
let len_buf = srcbuf.len() as u8;
|
||||
self.inner.write_all(&[len_buf])?;
|
||||
self.offset += 1;
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if srcbuf.len() > 0x7fff_ffff {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({} bytes)", srcbuf.len()),
|
||||
));
|
||||
}
|
||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
self.inner.write_all(&len_buf)?;
|
||||
self.offset += 4;
|
||||
}
|
||||
self.inner
|
||||
.write_all(&((srcbuf.len()) as u32).to_ne_bytes())?;
|
||||
self.inner.write_all(srcbuf)?;
|
||||
self.offset += srcbuf.len() as u64;
|
||||
self.offset += 4 + srcbuf.len() as u64;
|
||||
Ok(offset)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,10 +35,14 @@ use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::*;
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
@@ -46,13 +50,9 @@ use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -216,17 +216,12 @@ impl Layer for DeltaLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.key_range.contains(&key));
|
||||
@@ -252,9 +247,6 @@ impl Layer for DeltaLayer {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
@@ -263,18 +255,8 @@ impl Layer for DeltaLayer {
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
let buf = cursor.read_blob(pos).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let buf = cursor.read_blob(pos)?;
|
||||
let val = Value::des(&buf)?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
@@ -305,10 +287,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
|
||||
let inner = match self.load() {
|
||||
Ok(inner) => inner,
|
||||
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
|
||||
};
|
||||
let inner = self.load().unwrap();
|
||||
|
||||
match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
@@ -363,28 +342,6 @@ impl Layer for DeltaLayer {
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
@@ -393,10 +350,34 @@ impl Layer for DeltaLayer {
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
let mut desc = String::new();
|
||||
match cursor.read_blob(blob_ref.pos()) {
|
||||
Ok(buf) => {
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len()).unwrap();
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " READ ERROR: {}", err).unwrap();
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
@@ -421,28 +402,6 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
key_start: Key,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) -> PathBuf {
|
||||
let rand_string: String = rand::thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(8)
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timelineid, &tenantid).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.{}.temp",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end),
|
||||
rand_string
|
||||
))
|
||||
}
|
||||
|
||||
///
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
@@ -460,9 +419,7 @@ impl DeltaLayer {
|
||||
drop(inner);
|
||||
let inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(inner).with_context(|| {
|
||||
format!("Failed to load delta layer {}", self.path().display())
|
||||
})?;
|
||||
self.load_inner(inner)?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
@@ -630,8 +587,12 @@ impl DeltaLayerWriter {
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range);
|
||||
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.temp",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end)
|
||||
));
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
@@ -724,8 +685,6 @@ impl DeltaLayerWriter {
|
||||
}),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()?;
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
//! - page-oriented
|
||||
//!
|
||||
//! TODO:
|
||||
//! - better errors (e.g. with thiserror?)
|
||||
//! - maybe something like an Adaptive Radix Tree would be more efficient?
|
||||
//! - the values stored by image and delta layers are offsets into the file,
|
||||
//! and they are in monotonically increasing order. Prefix compression would
|
||||
@@ -18,12 +19,11 @@
|
||||
//! - An Iterator interface would be more convenient for the callers than the
|
||||
//! 'visit' function
|
||||
//!
|
||||
use anyhow;
|
||||
use byteorder::{ReadBytesExt, BE};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use hex;
|
||||
use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::layered_repository::block_io::{BlockReader, BlockWriter};
|
||||
|
||||
@@ -86,23 +86,6 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DiskBtreeError {
|
||||
#[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)]
|
||||
AppendOverflow(u64),
|
||||
|
||||
#[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")]
|
||||
UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> },
|
||||
|
||||
#[error("Could not push to new leaf node")]
|
||||
FailedToPushToNewLeafNode,
|
||||
|
||||
#[error("IoError: {0}")]
|
||||
Io(#[from] io::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = result::Result<T, DiskBtreeError>;
|
||||
|
||||
/// This is the on-disk representation.
|
||||
struct OnDiskNode<'a, const L: usize> {
|
||||
// Fixed-width fields
|
||||
@@ -123,12 +106,12 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
///
|
||||
/// Interpret a PAGE_SZ page as a node.
|
||||
///
|
||||
fn deparse(buf: &[u8]) -> Result<OnDiskNode<L>> {
|
||||
fn deparse(buf: &[u8]) -> OnDiskNode<L> {
|
||||
let mut cursor = std::io::Cursor::new(buf);
|
||||
let num_children = cursor.read_u16::<BE>()?;
|
||||
let level = cursor.read_u8()?;
|
||||
let prefix_len = cursor.read_u8()?;
|
||||
let suffix_len = cursor.read_u8()?;
|
||||
let num_children = cursor.read_u16::<BE>().unwrap();
|
||||
let level = cursor.read_u8().unwrap();
|
||||
let prefix_len = cursor.read_u8().unwrap();
|
||||
let suffix_len = cursor.read_u8().unwrap();
|
||||
|
||||
let mut off = cursor.position();
|
||||
let prefix_off = off as usize;
|
||||
@@ -146,7 +129,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
let keys = &buf[keys_off..keys_off + keys_len];
|
||||
let values = &buf[values_off..values_off + values_len];
|
||||
|
||||
Ok(OnDiskNode {
|
||||
OnDiskNode {
|
||||
num_children,
|
||||
level,
|
||||
prefix_len,
|
||||
@@ -154,7 +137,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
prefix,
|
||||
keys,
|
||||
values,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -166,11 +149,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
Value::from_slice(value_slice)
|
||||
}
|
||||
|
||||
fn binary_search(
|
||||
&self,
|
||||
search_key: &[u8; L],
|
||||
keybuf: &mut [u8],
|
||||
) -> result::Result<usize, usize> {
|
||||
fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result<usize, usize> {
|
||||
let mut size = self.num_children as usize;
|
||||
let mut low = 0;
|
||||
let mut high = size;
|
||||
@@ -230,7 +209,7 @@ where
|
||||
///
|
||||
/// Read the value for given key. Returns the value, or None if it doesn't exist.
|
||||
///
|
||||
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result<Option<u64>> {
|
||||
let mut result: Option<u64> = None;
|
||||
self.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
if key == search_key {
|
||||
@@ -251,7 +230,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
mut visitor: V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
@@ -264,7 +243,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
@@ -281,11 +260,11 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
let node = OnDiskNode::deparse(node_buf)?;
|
||||
let node = OnDiskNode::deparse(node_buf);
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -390,15 +369,15 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
pub fn dump(&self) -> anyhow::Result<()> {
|
||||
self.dump_recurse(self.root_blk, &[], 0)
|
||||
}
|
||||
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
let node = OnDiskNode::<L>::deparse(buf);
|
||||
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
println!(
|
||||
@@ -444,13 +423,6 @@ where
|
||||
///
|
||||
/// stack[0] is the current root page, stack.last() is the leaf.
|
||||
///
|
||||
/// We maintain the length of the stack to be always greater than zero.
|
||||
/// Two exceptions are:
|
||||
/// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
|
||||
/// So because other methods cannot see the intermediate state invariant still holds.
|
||||
/// 2. `Self::finish`. It consumes self and does not return it back,
|
||||
/// which means that this is where the structure is destroyed.
|
||||
/// Thus stack of zero length cannot be observed by other methods.
|
||||
stack: Vec<BuildNode<L>>,
|
||||
|
||||
/// Last key that was appended to the tree. Used to sanity check that append
|
||||
@@ -470,29 +442,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> {
|
||||
if value > MAX_VALUE {
|
||||
return Err(DiskBtreeError::AppendOverflow(value));
|
||||
}
|
||||
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> {
|
||||
assert!(value <= MAX_VALUE);
|
||||
if let Some(last_key) = &self.last_key {
|
||||
if key <= last_key {
|
||||
return Err(DiskBtreeError::UnsortedInput {
|
||||
key: key.as_slice().into(),
|
||||
last_key: last_key.as_slice().into(),
|
||||
});
|
||||
}
|
||||
assert!(key > last_key, "unsorted input");
|
||||
}
|
||||
self.last_key = Some(*key);
|
||||
|
||||
self.append_internal(key, Value::from_u64(value))
|
||||
Ok(self.append_internal(key, Value::from_u64(value))?)
|
||||
}
|
||||
|
||||
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> {
|
||||
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> {
|
||||
// Try to append to the current leaf buffer
|
||||
let last = self
|
||||
.stack
|
||||
.last_mut()
|
||||
.expect("should always have at least one item");
|
||||
let last = self.stack.last_mut().unwrap();
|
||||
let level = last.level;
|
||||
if last.push(key, value) {
|
||||
return Ok(());
|
||||
@@ -514,33 +476,26 @@ where
|
||||
// key to it.
|
||||
let mut last = BuildNode::new(level);
|
||||
if !last.push(key, value) {
|
||||
return Err(DiskBtreeError::FailedToPushToNewLeafNode);
|
||||
panic!("could not push to new leaf node");
|
||||
}
|
||||
|
||||
self.stack.push(last);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush the bottommost node in the stack to disk. Appends a downlink to its parent,
|
||||
/// and recursively flushes the parent too, if it becomes full. If the root page becomes full,
|
||||
/// creates a new root page, increasing the height of the tree.
|
||||
fn flush_node(&mut self) -> Result<()> {
|
||||
// Get the current bottommost node in the stack and flush it to disk.
|
||||
let last = self
|
||||
.stack
|
||||
.pop()
|
||||
.expect("should always have at least one item");
|
||||
fn flush_node(&mut self) -> Result<(), std::io::Error> {
|
||||
let last = self.stack.pop().unwrap();
|
||||
let buf = last.pack();
|
||||
let downlink_key = last.first_key();
|
||||
let downlink_ptr = self.writer.write_blk(buf)?;
|
||||
|
||||
// Append the downlink to the parent. If there is no parent, ie. this was the root page,
|
||||
// create a new root page, increasing the height of the tree.
|
||||
// Append the downlink to the parent
|
||||
if self.stack.is_empty() {
|
||||
self.stack.push(BuildNode::new(last.level + 1));
|
||||
}
|
||||
self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))
|
||||
self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -550,16 +505,13 @@ where
|
||||
/// (In the image and delta layers, it is stored in the beginning of the file,
|
||||
/// in the summary header)
|
||||
///
|
||||
pub fn finish(mut self) -> Result<(u32, W)> {
|
||||
pub fn finish(mut self) -> Result<(u32, W), std::io::Error> {
|
||||
// flush all levels, except the root.
|
||||
while self.stack.len() > 1 {
|
||||
self.flush_node()?;
|
||||
}
|
||||
|
||||
let root = self
|
||||
.stack
|
||||
.first()
|
||||
.expect("by the check above we left one item there");
|
||||
let root = self.stack.first().unwrap();
|
||||
let buf = root.pack();
|
||||
let root_blknum = self.writer.write_blk(buf)?;
|
||||
|
||||
@@ -740,14 +692,14 @@ mod tests {
|
||||
impl BlockReader for TestDisk {
|
||||
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf))
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
|
||||
fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
|
||||
let blknum = self.blocks.len();
|
||||
self.blocks.push(buf);
|
||||
Ok(blknum as u32)
|
||||
@@ -755,7 +707,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() -> Result<()> {
|
||||
fn basic() -> anyhow::Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
@@ -836,7 +788,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lots_of_keys() -> Result<()> {
|
||||
fn lots_of_keys() -> anyhow::Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
|
||||
@@ -930,7 +882,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_data() -> Result<()> {
|
||||
fn random_data() -> anyhow::Result<()> {
|
||||
// Generate random keys with exponential distribution, to
|
||||
// exercise the prefix compression
|
||||
const NUM_KEYS: usize = 100000;
|
||||
@@ -975,27 +927,21 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "unsorted input")]
|
||||
fn unsorted_input() {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk);
|
||||
|
||||
let _ = writer.append(b"ba", 1);
|
||||
let _ = writer.append(b"bb", 2);
|
||||
let err = writer.append(b"aa", 3).expect_err("should've failed");
|
||||
match err {
|
||||
DiskBtreeError::UnsortedInput { key, last_key } => {
|
||||
assert_eq!(key.as_ref(), b"aa".as_slice());
|
||||
assert_eq!(last_key.as_ref(), b"bb".as_slice());
|
||||
}
|
||||
_ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"),
|
||||
}
|
||||
let _ = writer.append(b"aa", 3);
|
||||
}
|
||||
|
||||
///
|
||||
/// This test contains a particular data set, see disk_btree_test_data.rs
|
||||
///
|
||||
#[test]
|
||||
fn particular_data() -> Result<()> {
|
||||
fn particular_data() -> anyhow::Result<()> {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
|
||||
@@ -16,8 +16,8 @@ use std::io::{Error, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tracing::*;
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
@@ -199,24 +199,18 @@ impl BlobWriter for EphemeralFile {
|
||||
let mut buf = self.get_buf_for_write(blknum)?;
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
buf[off] = srcbuf.len() as u8;
|
||||
off += 1;
|
||||
let len_buf = u32::to_ne_bytes(srcbuf.len() as u32);
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
@@ -235,13 +229,7 @@ impl BlobWriter for EphemeralFile {
|
||||
buf_remain = &buf_remain[this_blk_len..];
|
||||
}
|
||||
drop(buf);
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.size += 1;
|
||||
} else {
|
||||
self.size += 4;
|
||||
}
|
||||
self.size += srcbuf.len() as u64;
|
||||
self.size += 4 + srcbuf.len() as u64;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
@@ -256,31 +244,16 @@ impl Drop for EphemeralFile {
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
if let Err(e) = res {
|
||||
warn!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
// unlink file
|
||||
// FIXME: print error
|
||||
let _ = std::fs::remove_file(&self.file.path);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
file.path.display(),
|
||||
e
|
||||
),
|
||||
)),
|
||||
}
|
||||
file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
@@ -399,12 +372,6 @@ mod tests {
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with a large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
|
||||
@@ -30,11 +30,11 @@ use crate::layered_repository::storage_layer::{
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
@@ -44,11 +44,8 @@ use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -126,10 +123,6 @@ impl Layer for ImageLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
@@ -155,7 +148,6 @@ impl Layer for ImageLayer {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
@@ -242,22 +234,6 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
fname: &ImageFileName,
|
||||
) -> PathBuf {
|
||||
let rand_string: String = rand::thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(8)
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(format!("{}.{}.temp", fname, rand_string))
|
||||
}
|
||||
|
||||
///
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
@@ -275,9 +251,7 @@ impl ImageLayer {
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
self.load_inner(&mut inner)?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
@@ -415,7 +389,7 @@ impl ImageLayer {
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
_path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
key_range: Range<Key>,
|
||||
@@ -433,10 +407,12 @@ impl ImageLayerWriter {
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
let path = ImageLayer::temp_path_for(
|
||||
conf,
|
||||
// Create the file
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&ImageFileName {
|
||||
@@ -456,7 +432,7 @@ impl ImageLayerWriter {
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
_path: path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
key_range: key_range.clone(),
|
||||
@@ -527,25 +503,6 @@ impl ImageLayerWriter {
|
||||
index_root_blk,
|
||||
}),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()?;
|
||||
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, &final_path)?;
|
||||
|
||||
trace!("created image layer {}", layer.path().display());
|
||||
|
||||
Ok(layer)
|
||||
|
||||
@@ -14,21 +14,19 @@ use crate::layered_repository::storage_layer::{
|
||||
};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
vec_map::VecMap,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -85,10 +83,6 @@ impl Layer for InMemoryLayer {
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
@@ -119,7 +113,7 @@ impl Layer for InMemoryLayer {
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
ensure!(lsn_range.start <= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
@@ -130,6 +124,13 @@ impl Layer for InMemoryLayer {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
@@ -211,7 +212,7 @@ impl Layer for InMemoryLayer {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
@@ -267,13 +268,13 @@ impl InMemoryLayer {
|
||||
|
||||
/// Common subroutine of the public put_wal_record() and put_page_image() functions.
|
||||
/// Adds the page version to the in-memory tree
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
|
||||
pub fn put_value(&self, key: Key, lsn: Lsn, val: Value) -> Result<()> {
|
||||
trace!("put_value key {} at {}/{}", key, self.timelineid, lsn);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
|
||||
inner.assert_writeable();
|
||||
|
||||
let off = inner.file.write_blob(&Value::ser(val)?)?;
|
||||
let off = inner.file.write_blob(&Value::ser(&val)?)?;
|
||||
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
|
||||
|
||||
@@ -16,12 +16,12 @@ use crate::layered_repository::InMemoryLayer;
|
||||
use crate::repository::Key;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_gauge, IntGauge};
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
@@ -43,13 +43,10 @@ pub struct LayerMap {
|
||||
pub next_open_layer_at: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// Frozen layers, if any. Frozen layers are in-memory layers that
|
||||
/// are no longer added to, but haven't been written out to disk
|
||||
/// yet. They contain WAL older than the current 'open_layer' or
|
||||
/// 'next_open_layer_at', but newer than any historic layer.
|
||||
/// The frozen layers are in order from oldest to newest, so that
|
||||
/// the newest one is in the 'back' of the VecDeque, and the oldest
|
||||
/// in the 'front'.
|
||||
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
|
||||
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
|
||||
/// layer is during checkpointing, when an InMemoryLayer is being written out
|
||||
/// to disk.
|
||||
///
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
@@ -132,15 +129,17 @@ impl LayerMap {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
trace!(
|
||||
"found layer {} for request on {key} at {end_lsn}",
|
||||
"found layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
break;
|
||||
}
|
||||
// this layer's end LSN is smaller than the requested point. If there's
|
||||
// nothing newer, this is what we need to return. Remember this.
|
||||
if let Some(old_candidate) = &latest_delta {
|
||||
if let Some(ref old_candidate) = latest_delta {
|
||||
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
@@ -150,8 +149,10 @@ impl LayerMap {
|
||||
}
|
||||
if let Some(l) = latest_delta {
|
||||
trace!(
|
||||
"found (old) layer {} for request on {key} at {end_lsn}",
|
||||
"found (old) layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
let lsn_floor = std::cmp::max(
|
||||
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||
@@ -162,13 +163,17 @@ impl LayerMap {
|
||||
layer: l,
|
||||
}))
|
||||
} else if let Some(l) = latest_img {
|
||||
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
|
||||
trace!(
|
||||
"found img layer and no deltas for request on {} at {}",
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn.unwrap(),
|
||||
layer: l,
|
||||
}))
|
||||
} else {
|
||||
trace!("no layer found for request on {key} at {end_lsn}");
|
||||
trace!("no layer found for request on {} at {}", key, end_lsn);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@@ -186,6 +191,7 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
#[allow(dead_code)]
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let len_before = self.historic_layers.len();
|
||||
|
||||
@@ -201,14 +207,18 @@ impl LayerMap {
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
/// Is there a newer image layer for given key- and LSN-range?
|
||||
/// Is there a newer image layer for given key-range?
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
pub fn image_layer_exists(
|
||||
/// We ignore layers newer than disk_consistent_lsn because they will be removed at restart
|
||||
/// We also only look at historic layers
|
||||
//#[allow(dead_code)]
|
||||
pub fn newer_image_layer_exists(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
lsn: Lsn,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> Result<bool> {
|
||||
let mut range_remain = key_range.clone();
|
||||
|
||||
@@ -221,7 +231,8 @@ impl LayerMap {
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
if !l.is_incremental()
|
||||
&& l.get_key_range().contains(&range_remain.start)
|
||||
&& lsn_range.contains(&img_lsn)
|
||||
&& img_lsn > lsn
|
||||
&& img_lsn < disk_consistent_lsn
|
||||
{
|
||||
made_progress = true;
|
||||
let img_key_end = l.get_key_range().end;
|
||||
@@ -239,7 +250,7 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl Iterator<Item = &Arc<dyn Layer>> {
|
||||
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
|
||||
self.historic_layers.iter()
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ use std::path::PathBuf;
|
||||
|
||||
use anyhow::ensure;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
|
||||
@@ -4,15 +4,13 @@
|
||||
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
|
||||
where
|
||||
@@ -105,9 +103,6 @@ pub trait Layer: Send + Sync {
|
||||
/// log messages, even though they're never not on disk.)
|
||||
fn filename(&self) -> PathBuf;
|
||||
|
||||
/// If a layer has a corresponding file on a local filesystem, return its absolute path.
|
||||
fn local_path(&self) -> Option<PathBuf>;
|
||||
|
||||
///
|
||||
/// Return data needed to reconstruct given page at LSN.
|
||||
///
|
||||
|
||||
@@ -7,11 +7,9 @@ pub mod layered_repository;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod profiling;
|
||||
pub mod reltag;
|
||||
pub mod remote_storage;
|
||||
pub mod repository;
|
||||
pub mod storage_sync;
|
||||
pub mod tenant_config;
|
||||
pub mod tenant_mgr;
|
||||
pub mod tenant_threads;
|
||||
pub mod thread_mgr;
|
||||
@@ -24,10 +22,13 @@ pub mod walredo;
|
||||
|
||||
use lazy_static::lazy_static;
|
||||
use tracing::info;
|
||||
use utils::postgres_backend;
|
||||
use zenith_metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
use zenith_utils::{
|
||||
postgres_backend,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use metrics::{register_int_gauge_vec, IntGaugeVec};
|
||||
|
||||
use layered_repository::LayeredRepository;
|
||||
use pgdatadir_mapping::DatadirTimeline;
|
||||
@@ -45,7 +46,7 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
||||
|
||||
lazy_static! {
|
||||
static ref LIVE_CONNECTIONS_COUNT: IntGaugeVec = register_int_gauge_vec!(
|
||||
"pageserver_live_connections",
|
||||
"pageserver_live_connections_count",
|
||||
"Number of live network connections",
|
||||
&["pageserver_connection_kind"]
|
||||
)
|
||||
@@ -67,7 +68,7 @@ pub type RepositoryImpl = LayeredRepository;
|
||||
|
||||
pub type DatadirTimelineImpl = DatadirTimeline<RepositoryImpl>;
|
||||
|
||||
pub fn shutdown_pageserver(exit_code: i32) {
|
||||
pub fn shutdown_pageserver() {
|
||||
// Shut down the libpq endpoint thread. This prevents new connections from
|
||||
// being accepted.
|
||||
thread_mgr::shutdown_threads(Some(ThreadKind::LibpqEndpointListener), None, None);
|
||||
@@ -94,5 +95,5 @@ pub fn shutdown_pageserver(exit_code: i32) {
|
||||
thread_mgr::shutdown_threads(None, None, None);
|
||||
|
||||
info!("Shut down successfully completed");
|
||||
std::process::exit(exit_code);
|
||||
std::process::exit(0);
|
||||
}
|
||||
|
||||
@@ -47,7 +47,7 @@ use std::{
|
||||
|
||||
use once_cell::sync::OnceCell;
|
||||
use tracing::error;
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
@@ -14,26 +14,25 @@ use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::{Buf, BufMut, Bytes, BytesMut};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use std::io::{self, Read};
|
||||
use std::io;
|
||||
use std::net::TcpListener;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::{Arc, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
auth::{self, Claims, JwtAuth, Scope},
|
||||
lsn::Lsn,
|
||||
postgres_backend::{self, is_socket_read_timed_out, AuthType, PostgresBackend},
|
||||
pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC},
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_metrics::{register_histogram_vec, HistogramVec};
|
||||
use zenith_utils::auth::{self, JwtAuth};
|
||||
use zenith_utils::auth::{Claims, Scope};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::is_socket_read_timed_out;
|
||||
use zenith_utils::postgres_backend::PostgresBackend;
|
||||
use zenith_utils::postgres_backend::{self, AuthType};
|
||||
use zenith_utils::pq_proto::{BeMessage, FeMessage, RowDescriptor, SINGLE_COL_ROWDESC};
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::basebackup;
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
use crate::import_datadir::{import_basebackup_from_tar, import_wal_from_tar};
|
||||
use crate::layered_repository::LayeredRepository;
|
||||
use crate::pgdatadir_mapping::{DatadirTimeline, LsnForTimestamp};
|
||||
use crate::profiling::profpoint_start;
|
||||
use crate::config::PageServerConf;
|
||||
use crate::pgdatadir_mapping::DatadirTimeline;
|
||||
use crate::reltag::RelTag;
|
||||
use crate::repository::Repository;
|
||||
use crate::repository::Timeline;
|
||||
@@ -42,17 +41,12 @@ use crate::thread_mgr;
|
||||
use crate::thread_mgr::ThreadKind;
|
||||
use crate::walreceiver;
|
||||
use crate::CheckpointConfig;
|
||||
use metrics::{register_histogram_vec, HistogramVec};
|
||||
use postgres_ffi::xlog_utils::to_pg_timestamp;
|
||||
|
||||
use postgres_ffi::pg_constants;
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
enum PagestreamFeMessage {
|
||||
Exists(PagestreamExistsRequest),
|
||||
Nblocks(PagestreamNblocksRequest),
|
||||
GetPage(PagestreamGetPageRequest),
|
||||
DbSize(PagestreamDbSizeRequest),
|
||||
}
|
||||
|
||||
// Wrapped in libpq CopyData
|
||||
@@ -61,7 +55,6 @@ enum PagestreamBeMessage {
|
||||
Nblocks(PagestreamNblocksResponse),
|
||||
GetPage(PagestreamGetPageResponse),
|
||||
Error(PagestreamErrorResponse),
|
||||
DbSize(PagestreamDbSizeResponse),
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -86,13 +79,6 @@ struct PagestreamGetPageRequest {
|
||||
blkno: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamDbSizeRequest {
|
||||
latest: bool,
|
||||
lsn: Lsn,
|
||||
dbnode: u32,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamExistsResponse {
|
||||
exists: bool,
|
||||
@@ -113,11 +99,6 @@ struct PagestreamErrorResponse {
|
||||
message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PagestreamDbSizeResponse {
|
||||
db_size: i64,
|
||||
}
|
||||
|
||||
impl PagestreamFeMessage {
|
||||
fn parse(mut body: Bytes) -> anyhow::Result<PagestreamFeMessage> {
|
||||
// TODO these gets can fail
|
||||
@@ -159,11 +140,6 @@ impl PagestreamFeMessage {
|
||||
},
|
||||
blkno: body.get_u32(),
|
||||
})),
|
||||
3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
|
||||
latest: body.get_u8() != 0,
|
||||
lsn: Lsn::from(body.get_u64()),
|
||||
dbnode: body.get_u32(),
|
||||
})),
|
||||
_ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body),
|
||||
}
|
||||
}
|
||||
@@ -194,106 +170,12 @@ impl PagestreamBeMessage {
|
||||
bytes.put(resp.message.as_bytes());
|
||||
bytes.put_u8(0); // null terminator
|
||||
}
|
||||
Self::DbSize(resp) => {
|
||||
bytes.put_u8(104); /* tag from pagestore_client.h */
|
||||
bytes.put_i64(resp.db_size);
|
||||
}
|
||||
}
|
||||
|
||||
bytes.into()
|
||||
}
|
||||
}
|
||||
|
||||
/// Implements Read for the server side of CopyIn
|
||||
struct CopyInReader<'a> {
|
||||
pgb: &'a mut PostgresBackend,
|
||||
|
||||
/// Overflow buffer for bytes sent in CopyData messages
|
||||
/// that the reader (caller of read) hasn't asked for yet.
|
||||
/// TODO use BytesMut?
|
||||
buf: Vec<u8>,
|
||||
|
||||
/// Bytes before `buf_begin` are considered as dropped.
|
||||
/// This allows us to implement O(1) pop_front on Vec<u8>.
|
||||
/// The Vec won't grow large because we only add to it
|
||||
/// when it's empty.
|
||||
buf_begin: usize,
|
||||
}
|
||||
|
||||
impl<'a> CopyInReader<'a> {
|
||||
// NOTE: pgb should be in copy in state already
|
||||
fn new(pgb: &'a mut PostgresBackend) -> Self {
|
||||
Self {
|
||||
pgb,
|
||||
buf: Vec::<_>::new(),
|
||||
buf_begin: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Drop for CopyInReader<'a> {
|
||||
fn drop(&mut self) {
|
||||
// Finalize copy protocol so that self.pgb can be reused
|
||||
// TODO instead, maybe take ownership of pgb and give it back at the end
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
let _ = self.read_to_end(&mut buf);
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Read for CopyInReader<'a> {
|
||||
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
||||
while !thread_mgr::is_shutdown_requested() {
|
||||
// Return from buffer if nonempty
|
||||
if self.buf_begin < self.buf.len() {
|
||||
let bytes_to_read = std::cmp::min(buf.len(), self.buf.len() - self.buf_begin);
|
||||
buf[..bytes_to_read].copy_from_slice(&self.buf[self.buf_begin..][..bytes_to_read]);
|
||||
self.buf_begin += bytes_to_read;
|
||||
return Ok(bytes_to_read);
|
||||
}
|
||||
|
||||
// Delete garbage
|
||||
self.buf.clear();
|
||||
self.buf_begin = 0;
|
||||
|
||||
// Wait for client to send CopyData bytes
|
||||
match self.pgb.read_message() {
|
||||
Ok(Some(message)) => {
|
||||
let copy_data_bytes = match message {
|
||||
FeMessage::CopyData(bytes) => bytes,
|
||||
FeMessage::CopyDone => return Ok(0),
|
||||
FeMessage::Sync => continue,
|
||||
m => {
|
||||
let msg = format!("unexpected message {:?}", m);
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(&msg))?;
|
||||
return Err(io::Error::new(io::ErrorKind::Other, msg));
|
||||
}
|
||||
};
|
||||
|
||||
// Return as much as we can, saving the rest in self.buf
|
||||
let mut reader = copy_data_bytes.reader();
|
||||
let bytes_read = reader.read(buf)?;
|
||||
reader.read_to_end(&mut self.buf)?;
|
||||
return Ok(bytes_read);
|
||||
}
|
||||
Ok(None) => {
|
||||
let msg = "client closed connection";
|
||||
self.pgb.write_message(&BeMessage::ErrorResponse(msg))?;
|
||||
return Err(io::Error::new(io::ErrorKind::Other, msg));
|
||||
}
|
||||
Err(e) => {
|
||||
if !is_socket_read_timed_out(&e) {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, e));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Shutting down
|
||||
let msg = "Importer thread was shut down";
|
||||
Err(io::Error::new(io::ErrorKind::Other, msg))
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
///
|
||||
@@ -417,7 +299,7 @@ const TIME_BUCKETS: &[f64] = &[
|
||||
|
||||
lazy_static! {
|
||||
static ref SMGR_QUERY_TIME: HistogramVec = register_histogram_vec!(
|
||||
"pageserver_smgr_query_seconds",
|
||||
"pageserver_smgr_query_time",
|
||||
"Time spent on smgr query handling",
|
||||
&["smgr_query_type", "tenant_id", "timeline_id"],
|
||||
TIME_BUCKETS.into()
|
||||
@@ -434,7 +316,6 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::drop_non_drop)] // Only complains in CI, not sure why
|
||||
fn handle_pagerequests(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend,
|
||||
@@ -444,17 +325,14 @@ impl PageServerHandler {
|
||||
let _enter = info_span!("pagestream", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
/* switch client to COPYBOTH */
|
||||
pgb.write_message(&BeMessage::CopyBothResponse)?;
|
||||
|
||||
while !thread_mgr::is_shutdown_requested() {
|
||||
let msg = pgb.read_message();
|
||||
|
||||
let profiling_guard = profpoint_start(self.conf, ProfilingConfig::PageRequests);
|
||||
match msg {
|
||||
match pgb.read_message() {
|
||||
Ok(message) => {
|
||||
if let Some(message) = message {
|
||||
trace!("query: {:?}", message);
|
||||
@@ -484,11 +362,6 @@ impl PageServerHandler {
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_get_page_at_lsn_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
PagestreamFeMessage::DbSize(req) => SMGR_QUERY_TIME
|
||||
.with_label_values(&["get_db_size", &tenant_id, &timeline_id])
|
||||
.observe_closure_duration(|| {
|
||||
self.handle_db_size_request(timeline.as_ref(), &req)
|
||||
}),
|
||||
};
|
||||
|
||||
let response = response.unwrap_or_else(|e| {
|
||||
@@ -511,101 +384,10 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
}
|
||||
drop(profiling_guard);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_import_basebackup(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
base_lsn: Lsn,
|
||||
_end_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter =
|
||||
info_span!("import basebackup", timeline = %timeline_id, tenant = %tenant_id).entered();
|
||||
|
||||
// Create empty timeline
|
||||
info!("creating new timeline");
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let timeline = repo.create_empty_timeline(timeline_id, base_lsn)?;
|
||||
let repartition_distance = repo.get_checkpoint_distance();
|
||||
let mut datadir_timeline =
|
||||
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
|
||||
|
||||
// TODO mark timeline as not ready until it reaches end_lsn.
|
||||
// We might have some wal to import as well, and we should prevent compute
|
||||
// from connecting before that and writing conflicting wal.
|
||||
//
|
||||
// This is not relevant for pageserver->pageserver migrations, since there's
|
||||
// no wal to import. But should be fixed if we want to import from postgres.
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
|
||||
// Import basebackup provided via CopyData
|
||||
info!("importing basebackup");
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
let reader = CopyInReader::new(pgb);
|
||||
import_basebackup_from_tar(&mut datadir_timeline, reader, base_lsn)?;
|
||||
|
||||
// TODO check checksum
|
||||
// Meanwhile you can verify client-side by taking fullbackup
|
||||
// and checking that it matches in size with what was imported.
|
||||
// It wouldn't work if base came from vanilla postgres though,
|
||||
// since we discard some log files.
|
||||
|
||||
// Flush data to disk, then upload to s3
|
||||
info!("flushing layers");
|
||||
datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_import_wal(
|
||||
&self,
|
||||
pgb: &mut PostgresBackend,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
start_lsn: Lsn,
|
||||
end_lsn: Lsn,
|
||||
) -> anyhow::Result<()> {
|
||||
let _enter =
|
||||
info_span!("import wal", timeline = %timeline_id, tenant = %tenant_id).entered();
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let timeline = repo.get_timeline_load(timeline_id)?;
|
||||
ensure!(timeline.get_last_record_lsn() == start_lsn);
|
||||
|
||||
let repartition_distance = repo.get_checkpoint_distance();
|
||||
let mut datadir_timeline =
|
||||
DatadirTimeline::<LayeredRepository>::new(timeline, repartition_distance);
|
||||
|
||||
// TODO leave clean state on error. For now you can use detach to clean
|
||||
// up broken state from a failed import.
|
||||
|
||||
// Import wal provided via CopyData
|
||||
info!("importing wal");
|
||||
pgb.write_message(&BeMessage::CopyInResponse)?;
|
||||
let reader = CopyInReader::new(pgb);
|
||||
import_wal_from_tar(&mut datadir_timeline, reader, start_lsn, end_lsn)?;
|
||||
|
||||
// TODO Does it make sense to overshoot?
|
||||
ensure!(datadir_timeline.tline.get_last_record_lsn() >= end_lsn);
|
||||
|
||||
// Flush data to disk, then upload to s3. No need for a forced checkpoint.
|
||||
// We only want to persist the data, and it doesn't matter if it's in the
|
||||
// shape of deltas or images.
|
||||
info!("flushing layers");
|
||||
datadir_timeline.tline.checkpoint(CheckpointConfig::Flush)?;
|
||||
|
||||
info!("done");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function to handle the LSN from client request.
|
||||
///
|
||||
/// Each GetPage (and Exists and Nblocks) request includes information about
|
||||
@@ -699,32 +481,6 @@ impl PageServerHandler {
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_db_size_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
req: &PagestreamDbSizeRequest,
|
||||
) -> Result<PagestreamBeMessage> {
|
||||
let _enter = info_span!("get_db_size", dbnode = %req.dbnode, req_lsn = %req.lsn).entered();
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn)?;
|
||||
|
||||
let all_rels = timeline.list_rels(pg_constants::DEFAULTTABLESPACE_OID, req.dbnode, lsn)?;
|
||||
let mut total_blocks: i64 = 0;
|
||||
|
||||
for rel in all_rels {
|
||||
if rel.forknum == 0 {
|
||||
let n_blocks = timeline.get_rel_size(rel, lsn).unwrap_or(0);
|
||||
total_blocks += n_blocks as i64;
|
||||
}
|
||||
}
|
||||
|
||||
let db_size = total_blocks * pg_constants::BLCKSZ as i64;
|
||||
|
||||
Ok(PagestreamBeMessage::DbSize(PagestreamDbSizeResponse {
|
||||
db_size,
|
||||
}))
|
||||
}
|
||||
|
||||
fn handle_get_page_at_lsn_request<R: Repository>(
|
||||
&self,
|
||||
timeline: &DatadirTimeline<R>,
|
||||
@@ -761,7 +517,7 @@ impl PageServerHandler {
|
||||
info!("starting");
|
||||
|
||||
// check that the timeline exists
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
let latest_gc_cutoff_lsn = timeline.tline.get_latest_gc_cutoff_lsn();
|
||||
if let Some(lsn) = lsn {
|
||||
@@ -776,7 +532,7 @@ impl PageServerHandler {
|
||||
/* Send a tarball of the latest layer on the timeline */
|
||||
{
|
||||
let mut writer = CopyDataSink { pgb };
|
||||
let basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
||||
let mut basebackup = basebackup::Basebackup::new(&mut writer, &timeline, lsn)?;
|
||||
span.record("lsn", &basebackup.lsn.to_string().as_str());
|
||||
basebackup.send_tarball()?;
|
||||
}
|
||||
@@ -895,125 +651,27 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
info_span!("callmemaybe", timeline = %timelineid, tenant = %tenantid).entered();
|
||||
|
||||
// Check that the timeline exists
|
||||
tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
walreceiver::launch_wal_receiver(self.conf, tenantid, timelineid, &connstr)?;
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("import basebackup ") {
|
||||
// Import the `base` section (everything but the wal) of a basebackup.
|
||||
// Assumes the tenant already exists on this pageserver.
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
//
|
||||
// Example import command:
|
||||
// 1. Get start/end LSN from backup_manifest file
|
||||
// 2. Run:
|
||||
// cat my_backup/base.tar | psql -h $PAGESERVER \
|
||||
// -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN"
|
||||
let (_, params_raw) = query_string.split_at("import basebackup ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
ensure!(params.len() == 4);
|
||||
let tenant = ZTenantId::from_str(params[0])?;
|
||||
let timeline = ZTimelineId::from_str(params[1])?;
|
||||
let base_lsn = Lsn::from_str(params[2])?;
|
||||
let end_lsn = Lsn::from_str(params[3])?;
|
||||
|
||||
self.check_permission(Some(tenant))?;
|
||||
|
||||
match self.handle_import_basebackup(pgb, tenant, timeline, base_lsn, end_lsn) {
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
|
||||
}
|
||||
};
|
||||
} else if query_string.starts_with("import wal ") {
|
||||
// Import the `pg_wal` section of a basebackup.
|
||||
//
|
||||
// Files are scheduled to be persisted to remote storage, and the
|
||||
// caller should poll the http api to check when that is done.
|
||||
let (_, params_raw) = query_string.split_at("import wal ".len());
|
||||
let params = params_raw.split_whitespace().collect::<Vec<_>>();
|
||||
ensure!(params.len() == 4);
|
||||
let tenant = ZTenantId::from_str(params[0])?;
|
||||
let timeline = ZTimelineId::from_str(params[1])?;
|
||||
let start_lsn = Lsn::from_str(params[2])?;
|
||||
let end_lsn = Lsn::from_str(params[3])?;
|
||||
|
||||
self.check_permission(Some(tenant))?;
|
||||
|
||||
match self.handle_import_wal(pgb, tenant, timeline, start_lsn, end_lsn) {
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(&e.to_string()))?
|
||||
}
|
||||
};
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("failpoints ") {
|
||||
ensure!(fail::has_failpoints(), "Cannot manage failpoints because pageserver was compiled without failpoints support");
|
||||
|
||||
let (_, failpoints) = query_string.split_at("failpoints ".len());
|
||||
|
||||
for failpoint in failpoints.split(';') {
|
||||
if let Some((name, actions)) = failpoint.split_once('=') {
|
||||
info!("cfg failpoint: {} {}", name, actions);
|
||||
|
||||
// We recognize one extra "action" that's not natively recognized
|
||||
// by the failpoints crate: exit, to immediately kill the process
|
||||
if actions == "exit" {
|
||||
fail::cfg_callback(name, || {
|
||||
info!("Exit requested by failpoint");
|
||||
std::process::exit(1);
|
||||
})
|
||||
.unwrap();
|
||||
} else {
|
||||
fail::cfg(name, actions).unwrap();
|
||||
}
|
||||
fail::cfg(name, actions).unwrap();
|
||||
} else {
|
||||
bail!("Invalid failpoints format");
|
||||
}
|
||||
}
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("show ") {
|
||||
// show <tenant_id>
|
||||
let (_, params_raw) = query_string.split_at("show ".len());
|
||||
let params = params_raw.split(' ').collect::<Vec<_>>();
|
||||
ensure!(params.len() == 1, "invalid param number for config command");
|
||||
let tenantid = ZTenantId::from_str(params[0])?;
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"checkpoint_distance"),
|
||||
RowDescriptor::int8_col(b"compaction_target_size"),
|
||||
RowDescriptor::int8_col(b"compaction_period"),
|
||||
RowDescriptor::int8_col(b"compaction_threshold"),
|
||||
RowDescriptor::int8_col(b"gc_horizon"),
|
||||
RowDescriptor::int8_col(b"gc_period"),
|
||||
RowDescriptor::int8_col(b"image_creation_threshold"),
|
||||
RowDescriptor::int8_col(b"pitr_interval"),
|
||||
]))?
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(repo.get_checkpoint_distance().to_string().as_bytes()),
|
||||
Some(repo.get_compaction_target_size().to_string().as_bytes()),
|
||||
Some(
|
||||
repo.get_compaction_period()
|
||||
.as_secs()
|
||||
.to_string()
|
||||
.as_bytes(),
|
||||
),
|
||||
Some(repo.get_compaction_threshold().to_string().as_bytes()),
|
||||
Some(repo.get_gc_horizon().to_string().as_bytes()),
|
||||
Some(repo.get_gc_period().as_secs().to_string().as_bytes()),
|
||||
Some(repo.get_image_creation_threshold().to_string().as_bytes()),
|
||||
Some(repo.get_pitr_interval().as_secs().to_string().as_bytes()),
|
||||
]))?
|
||||
.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("do_gc ") {
|
||||
// Run GC immediately on given timeline.
|
||||
// FIXME: This is just for tests. See test_runner/batch_others/test_gc.py.
|
||||
@@ -1031,22 +689,16 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
|
||||
let gc_horizon: u64 = caps
|
||||
.get(4)
|
||||
.map(|h| h.as_str().parse())
|
||||
.unwrap_or_else(|| Ok(repo.get_gc_horizon()))?;
|
||||
.unwrap_or(Ok(self.conf.gc_horizon))?;
|
||||
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenantid)?;
|
||||
// Use tenant's pitr setting
|
||||
let pitr = repo.get_pitr_interval();
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, pitr, true)?;
|
||||
let result = repo.gc_iteration(Some(timelineid), gc_horizon, true)?;
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[
|
||||
RowDescriptor::int8_col(b"layers_total"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_cutoff"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_pitr"),
|
||||
RowDescriptor::int8_col(b"layers_needed_by_branches"),
|
||||
RowDescriptor::int8_col(b"layers_not_updated"),
|
||||
RowDescriptor::int8_col(b"layers_removed"),
|
||||
@@ -1055,7 +707,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
.write_message_noflush(&BeMessage::DataRow(&[
|
||||
Some(result.layers_total.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_cutoff.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_pitr.to_string().as_bytes()),
|
||||
Some(result.layers_needed_by_branches.to_string().as_bytes()),
|
||||
Some(result.layers_not_updated.to_string().as_bytes()),
|
||||
Some(result.layers_removed.to_string().as_bytes()),
|
||||
@@ -1076,7 +727,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Couldn't load timeline")?;
|
||||
timeline.tline.compact()?;
|
||||
|
||||
@@ -1095,7 +746,7 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
let timeline = tenant_mgr::get_timeline_for_tenant_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
timeline.tline.checkpoint(CheckpointConfig::Forced)?;
|
||||
@@ -1108,34 +759,6 @@ impl postgres_backend::Handler for PageServerHandler {
|
||||
|
||||
pgb.write_message_noflush(&SINGLE_COL_ROWDESC)?
|
||||
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("get_lsn_by_timestamp ") {
|
||||
// Locate LSN of last transaction with timestamp less or equal than sppecified
|
||||
// TODO lazy static
|
||||
let re = Regex::new(r"^get_lsn_by_timestamp ([[:xdigit:]]+) ([[:xdigit:]]+) '(.*)'$")
|
||||
.unwrap();
|
||||
let caps = re
|
||||
.captures(query_string)
|
||||
.with_context(|| format!("invalid get_lsn_by_timestamp: '{}'", query_string))?;
|
||||
|
||||
let tenantid = ZTenantId::from_str(caps.get(1).unwrap().as_str())?;
|
||||
let timelineid = ZTimelineId::from_str(caps.get(2).unwrap().as_str())?;
|
||||
let timeline = tenant_mgr::get_local_timeline_with_load(tenantid, timelineid)
|
||||
.context("Cannot load local timeline")?;
|
||||
|
||||
let timestamp = humantime::parse_rfc3339(caps.get(3).unwrap().as_str())?;
|
||||
let timestamp_pg = to_pg_timestamp(timestamp);
|
||||
|
||||
pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
|
||||
b"lsn",
|
||||
)]))?;
|
||||
let result = match timeline.find_lsn_for_timestamp(timestamp_pg)? {
|
||||
LsnForTimestamp::Present(lsn) => format!("{}", lsn),
|
||||
LsnForTimestamp::Future(_lsn) => "future".into(),
|
||||
LsnForTimestamp::Past(_lsn) => "past".into(),
|
||||
LsnForTimestamp::NoData(_lsn) => "nodata".into(),
|
||||
};
|
||||
pgb.write_message_noflush(&BeMessage::DataRow(&[Some(result.as_bytes())]))?;
|
||||
pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else {
|
||||
bail!("unknown command");
|
||||
}
|
||||
|
||||
@@ -13,7 +13,6 @@ use crate::repository::{Repository, Timeline};
|
||||
use crate::walrecord::ZenithWalRecord;
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use bytes::{Buf, Bytes};
|
||||
use postgres_ffi::xlog_utils::TimestampTz;
|
||||
use postgres_ffi::{pg_constants, Oid, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::{HashMap, HashSet};
|
||||
@@ -21,7 +20,8 @@ use std::ops::Range;
|
||||
use std::sync::atomic::{AtomicIsize, Ordering};
|
||||
use std::sync::{Arc, Mutex, RwLockReadGuard};
|
||||
use tracing::{debug, error, trace, warn};
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
|
||||
pub type BlockNumber = u32;
|
||||
@@ -46,14 +46,6 @@ where
|
||||
current_logical_size: AtomicIsize,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
Present(Lsn),
|
||||
Future(Lsn),
|
||||
Past(Lsn),
|
||||
NoData(Lsn),
|
||||
}
|
||||
|
||||
impl<R: Repository> DatadirTimeline<R> {
|
||||
pub fn new(tline: Arc<R::Timeline>, repartition_threshold: u64) -> Self {
|
||||
DatadirTimeline {
|
||||
@@ -211,106 +203,6 @@ impl<R: Repository> DatadirTimeline<R> {
|
||||
Ok(exists)
|
||||
}
|
||||
|
||||
/// Locate LSN, such that all transactions that committed before
|
||||
/// 'search_timestamp' are visible, but nothing newer is.
|
||||
///
|
||||
/// This is not exact. Commit timestamps are not guaranteed to be ordered,
|
||||
/// so it's not well defined which LSN you get if there were multiple commits
|
||||
/// "in flight" at that point in time.
|
||||
///
|
||||
pub fn find_lsn_for_timestamp(&self, search_timestamp: TimestampTz) -> Result<LsnForTimestamp> {
|
||||
let gc_cutoff_lsn_guard = self.tline.get_latest_gc_cutoff_lsn();
|
||||
let min_lsn = *gc_cutoff_lsn_guard;
|
||||
let max_lsn = self.tline.get_last_record_lsn();
|
||||
|
||||
// LSNs are always 8-byte aligned. low/mid/high represent the
|
||||
// LSN divided by 8.
|
||||
let mut low = min_lsn.0 / 8;
|
||||
let mut high = max_lsn.0 / 8 + 1;
|
||||
|
||||
let mut found_smaller = false;
|
||||
let mut found_larger = false;
|
||||
while low < high {
|
||||
// cannot overflow, high and low are both smaller than u64::MAX / 2
|
||||
let mid = (high + low) / 2;
|
||||
|
||||
let cmp = self.is_latest_commit_timestamp_ge_than(
|
||||
search_timestamp,
|
||||
Lsn(mid * 8),
|
||||
&mut found_smaller,
|
||||
&mut found_larger,
|
||||
)?;
|
||||
|
||||
if cmp {
|
||||
high = mid;
|
||||
} else {
|
||||
low = mid + 1;
|
||||
}
|
||||
}
|
||||
match (found_smaller, found_larger) {
|
||||
(false, false) => {
|
||||
// This can happen if no commit records have been processed yet, e.g.
|
||||
// just after importing a cluster.
|
||||
Ok(LsnForTimestamp::NoData(max_lsn))
|
||||
}
|
||||
(true, false) => {
|
||||
// Didn't find any commit timestamps larger than the request
|
||||
Ok(LsnForTimestamp::Future(max_lsn))
|
||||
}
|
||||
(false, true) => {
|
||||
// Didn't find any commit timestamps smaller than the request
|
||||
Ok(LsnForTimestamp::Past(max_lsn))
|
||||
}
|
||||
(true, true) => {
|
||||
// low is the LSN of the first commit record *after* the search_timestamp,
|
||||
// Back off by one to get to the point just before the commit.
|
||||
//
|
||||
// FIXME: it would be better to get the LSN of the previous commit.
|
||||
// Otherwise, if you restore to the returned LSN, the database will
|
||||
// include physical changes from later commits that will be marked
|
||||
// as aborted, and will need to be vacuumed away.
|
||||
Ok(LsnForTimestamp::Present(Lsn((low - 1) * 8)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Subroutine of find_lsn_for_timestamp(). Returns true, if there are any
|
||||
/// commits that committed after 'search_timestamp', at LSN 'probe_lsn'.
|
||||
///
|
||||
/// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
|
||||
/// with a smaller/larger timestamp.
|
||||
///
|
||||
fn is_latest_commit_timestamp_ge_than(
|
||||
&self,
|
||||
search_timestamp: TimestampTz,
|
||||
probe_lsn: Lsn,
|
||||
found_smaller: &mut bool,
|
||||
found_larger: &mut bool,
|
||||
) -> Result<bool> {
|
||||
for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn)? {
|
||||
let nblocks = self.get_slru_segment_size(SlruKind::Clog, segno, probe_lsn)?;
|
||||
for blknum in (0..nblocks).rev() {
|
||||
let clog_page =
|
||||
self.get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn)?;
|
||||
|
||||
if clog_page.len() == pg_constants::BLCKSZ as usize + 8 {
|
||||
let mut timestamp_bytes = [0u8; 8];
|
||||
timestamp_bytes.copy_from_slice(&clog_page[pg_constants::BLCKSZ as usize..]);
|
||||
let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
|
||||
|
||||
if timestamp >= search_timestamp {
|
||||
*found_larger = true;
|
||||
return Ok(true);
|
||||
} else {
|
||||
*found_smaller = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
/// Get a list of SLRU segments
|
||||
pub fn list_slru_segments(&self, kind: SlruKind, lsn: Lsn) -> Result<HashSet<u32>> {
|
||||
// fetch directory entry
|
||||
@@ -750,7 +642,6 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
||||
}
|
||||
|
||||
/// Extend relation
|
||||
/// If new size is smaller, do nothing.
|
||||
pub fn put_rel_extend(&mut self, rel: RelTag, nblocks: BlockNumber) -> Result<()> {
|
||||
ensure!(rel.relnode != 0, "invalid relnode");
|
||||
|
||||
@@ -758,13 +649,10 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
||||
let size_key = rel_size_to_key(rel);
|
||||
let old_size = self.get(size_key)?.get_u32_le();
|
||||
|
||||
// only extend relation here. never decrease the size
|
||||
if nblocks > old_size {
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
||||
let buf = nblocks.to_le_bytes();
|
||||
self.put(size_key, Value::Image(Bytes::from(buf.to_vec())));
|
||||
|
||||
self.pending_nblocks += nblocks as isize - old_size as isize;
|
||||
}
|
||||
self.pending_nblocks += nblocks as isize - old_size as isize;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -884,57 +772,6 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Flush changes accumulated so far to the underlying repository.
|
||||
///
|
||||
/// Usually, changes made in DatadirModification are atomic, but this allows
|
||||
/// you to flush them to the underlying repository before the final `commit`.
|
||||
/// That allows to free up the memory used to hold the pending changes.
|
||||
///
|
||||
/// Currently only used during bulk import of a data directory. In that
|
||||
/// context, breaking the atomicity is OK. If the import is interrupted, the
|
||||
/// whole import fails and the timeline will be deleted anyway.
|
||||
/// (Or to be precise, it will be left behind for debugging purposes and
|
||||
/// ignored, see https://github.com/neondatabase/neon/pull/1809)
|
||||
///
|
||||
/// Note: A consequence of flushing the pending operations is that they
|
||||
/// won't be visible to subsequent operations until `commit`. The function
|
||||
/// retains all the metadata, but data pages are flushed. That's again OK
|
||||
/// for bulk import, where you are just loading data pages and won't try to
|
||||
/// modify the same pages twice.
|
||||
pub fn flush(&mut self) -> Result<()> {
|
||||
// Unless we have accumulated a decent amount of changes, it's not worth it
|
||||
// to scan through the pending_updates list.
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
if pending_nblocks < 10000 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let writer = self.tline.tline.writer();
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
let mut result: Result<()> = Ok(());
|
||||
self.pending_updates.retain(|&key, value| {
|
||||
if result.is_ok() && (is_rel_block_key(key) || is_slru_block_key(key)) {
|
||||
result = writer.put(key, self.lsn, value);
|
||||
false
|
||||
} else {
|
||||
true
|
||||
}
|
||||
});
|
||||
result?;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
self.tline.current_logical_size.fetch_add(
|
||||
pending_nblocks * pg_constants::BLCKSZ as isize,
|
||||
Ordering::SeqCst,
|
||||
);
|
||||
self.pending_nblocks = 0;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
/// Finish this atomic update, writing all the updated keys to the
|
||||
/// underlying timeline.
|
||||
@@ -945,7 +782,7 @@ impl<'a, R: Repository> DatadirModification<'a, R> {
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
|
||||
for (key, value) in self.pending_updates {
|
||||
writer.put(key, self.lsn, &value)?;
|
||||
writer.put(key, self.lsn, value)?;
|
||||
}
|
||||
for key_range in self.pending_deletions {
|
||||
writer.delete(key_range.clone(), self.lsn)?;
|
||||
@@ -1350,10 +1187,6 @@ pub fn key_to_rel_block(key: Key) -> Result<(RelTag, BlockNumber)> {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_rel_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x00 && key.field4 != 0
|
||||
}
|
||||
|
||||
pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
|
||||
Ok(match key.field1 {
|
||||
0x01 => {
|
||||
@@ -1372,12 +1205,6 @@ pub fn key_to_slru_block(key: Key) -> Result<(SlruKind, u32, BlockNumber)> {
|
||||
})
|
||||
}
|
||||
|
||||
fn is_slru_block_key(key: Key) -> bool {
|
||||
key.field1 == 0x01 // SLRU-related
|
||||
&& key.field3 == 0x00000001 // but not SlruDir
|
||||
&& key.field6 != 0xffffffff // and not SlruSegSize
|
||||
}
|
||||
|
||||
//
|
||||
//-- Tests that should work the same with any Repository/Timeline implementation.
|
||||
//
|
||||
@@ -1385,7 +1212,7 @@ fn is_slru_block_key(key: Key) -> bool {
|
||||
#[cfg(test)]
|
||||
pub fn create_test_timeline<R: Repository>(
|
||||
repo: R,
|
||||
timeline_id: utils::zid::ZTimelineId,
|
||||
timeline_id: zenith_utils::zid::ZTimelineId,
|
||||
) -> Result<Arc<crate::DatadirTimeline<R>>> {
|
||||
let tline = repo.create_empty_timeline(timeline_id, Lsn(8))?;
|
||||
let tline = DatadirTimeline::new(tline, 256 * 1024);
|
||||
|
||||
@@ -1,101 +0,0 @@
|
||||
//!
|
||||
//! Support for profiling
|
||||
//!
|
||||
//! This relies on a modified version of the 'pprof-rs' crate. That's not very
|
||||
//! nice, so to avoid a hard dependency on that, this is an optional feature.
|
||||
//!
|
||||
use crate::config::{PageServerConf, ProfilingConfig};
|
||||
|
||||
/// The actual implementation is in the `profiling_impl` submodule. If the profiling
|
||||
/// feature is not enabled, it's just a dummy implementation that panics if you
|
||||
/// try to enabled profiling in the configuration.
|
||||
pub use profiling_impl::*;
|
||||
|
||||
#[cfg(feature = "profiling")]
|
||||
mod profiling_impl {
|
||||
use super::*;
|
||||
use pprof;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
/// Start profiling the current thread. Returns a guard object;
|
||||
/// the profiling continues until the guard is dropped.
|
||||
///
|
||||
/// Note: profiling is not re-entrant. If you call 'profpoint_start' while
|
||||
/// profiling is already started, nothing happens, and the profiling will be
|
||||
/// stopped when either guard object is dropped.
|
||||
#[inline]
|
||||
pub fn profpoint_start(
|
||||
conf: &crate::config::PageServerConf,
|
||||
point: ProfilingConfig,
|
||||
) -> Option<ProfilingGuard> {
|
||||
if conf.profiling == point {
|
||||
pprof::start_profiling();
|
||||
Some(ProfilingGuard(PhantomData))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// A hack to remove Send and Sync from the ProfilingGuard. Because the
|
||||
/// profiling is attached to current thread.
|
||||
////
|
||||
/// See comments in https://github.com/rust-lang/rust/issues/68318
|
||||
type PhantomUnsend = std::marker::PhantomData<*mut u8>;
|
||||
|
||||
pub struct ProfilingGuard(PhantomUnsend);
|
||||
|
||||
impl Drop for ProfilingGuard {
|
||||
fn drop(&mut self) {
|
||||
pprof::stop_profiling();
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize the profiler. This must be called before any 'profpoint_start' calls.
|
||||
pub fn init_profiler(conf: &PageServerConf) -> Option<pprof::ProfilerGuard> {
|
||||
if conf.profiling != ProfilingConfig::Disabled {
|
||||
Some(pprof::ProfilerGuardBuilder::default().build().unwrap())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Exit the profiler. Writes the flamegraph to current workdir.
|
||||
pub fn exit_profiler(_conf: &PageServerConf, profiler_guard: &Option<pprof::ProfilerGuard>) {
|
||||
// Write out the flamegraph
|
||||
if let Some(profiler_guard) = profiler_guard {
|
||||
if let Ok(report) = profiler_guard.report().build() {
|
||||
// this gets written under the workdir
|
||||
let file = std::fs::File::create("flamegraph.svg").unwrap();
|
||||
let mut options = pprof::flamegraph::Options::default();
|
||||
options.image_width = Some(2500);
|
||||
report.flamegraph_with_options(file, &mut options).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Dummy implementation when compiling without profiling feature or for non-linux OSes.
|
||||
#[cfg(not(feature = "profiling"))]
|
||||
mod profiling_impl {
|
||||
use super::*;
|
||||
|
||||
pub struct DummyProfilerGuard;
|
||||
|
||||
pub fn profpoint_start(
|
||||
_conf: &PageServerConf,
|
||||
_point: ProfilingConfig,
|
||||
) -> Option<DummyProfilerGuard> {
|
||||
None
|
||||
}
|
||||
|
||||
pub fn init_profiler(conf: &PageServerConf) -> Option<DummyProfilerGuard> {
|
||||
if conf.profiling != ProfilingConfig::Disabled {
|
||||
// shouldn't happen, we don't allow profiling in the config if the support
|
||||
// for it is disabled.
|
||||
panic!("profiling enabled but the binary was compiled without profiling support");
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
pub fn exit_profiler(_conf: &PageServerConf, _guard: &Option<DummyProfilerGuard>) {}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user