mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-25 09:00:37 +00:00
Compare commits
62 Commits
bojan-ci-t
...
bojan-psbe
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
20923e70f5 | ||
|
|
98149f1a08 | ||
|
|
fbc4206f2d | ||
|
|
d614291c44 | ||
|
|
35d8167f68 | ||
|
|
2f7d9d2dd5 | ||
|
|
21f9774ea4 | ||
|
|
098d7046f8 | ||
|
|
a39de2997f | ||
|
|
d756921220 | ||
|
|
2bc9ed164f | ||
|
|
8c4d270cde | ||
|
|
35584f7242 | ||
|
|
12141523f6 | ||
|
|
d383ed4e68 | ||
|
|
13ec0ce7b2 | ||
|
|
80fc133833 | ||
|
|
3da14d56f2 | ||
|
|
b0b2093d00 | ||
|
|
02aa7c023a | ||
|
|
180631da1f | ||
|
|
811d46f070 | ||
|
|
728f299641 | ||
|
|
fb49418e7f | ||
|
|
887dc8f112 | ||
|
|
aa7b32d892 | ||
|
|
d7ed9d8e01 | ||
|
|
96c2b3a80a | ||
|
|
7560854370 | ||
|
|
adbbb0a4c8 | ||
|
|
6a264aaca3 | ||
|
|
60ed6b3710 | ||
|
|
89690d7349 | ||
|
|
09f2dff537 | ||
|
|
2d8587f67d | ||
|
|
c559c72ede | ||
|
|
f06707badc | ||
|
|
64cdd6064d | ||
|
|
4bd557ca61 | ||
|
|
ee40297758 | ||
|
|
d5b8380dae | ||
|
|
bce2da4e55 | ||
|
|
3948956e87 | ||
|
|
a726b555fb | ||
|
|
0e3512aad0 | ||
|
|
6fb566b46f | ||
|
|
dd56eeefbf | ||
|
|
d19a293e7e | ||
|
|
be4aebd7e9 | ||
|
|
dac73328ba | ||
|
|
fb79c7f1f0 | ||
|
|
e7bd74d558 | ||
|
|
da8beffc95 | ||
|
|
98ec8418c4 | ||
|
|
92d1322cd5 | ||
|
|
2896d35a8b | ||
|
|
e096c62494 | ||
|
|
356f716d39 | ||
|
|
798ff26fb0 | ||
|
|
28045890eb | ||
|
|
6127b6638b | ||
|
|
c7c1e19667 |
2
.circleci/ansible/.gitignore
vendored
2
.circleci/ansible/.gitignore
vendored
@@ -1,4 +1,2 @@
|
||||
zenith_install.tar.gz
|
||||
.zenith_current_version
|
||||
neon_install.tar.gz
|
||||
.neon_current_version
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
- name: Upload Neon binaries
|
||||
- name: Upload Zenith binaries
|
||||
hosts: storage
|
||||
gather_facts: False
|
||||
remote_user: admin
|
||||
|
||||
tasks:
|
||||
|
||||
- name: get latest version of Neon binaries
|
||||
- name: get latest version of Zenith binaries
|
||||
register: current_version_file
|
||||
set_fact:
|
||||
current_version: "{{ lookup('file', '.neon_current_version') | trim }}"
|
||||
current_version: "{{ lookup('file', '.zenith_current_version') | trim }}"
|
||||
tags:
|
||||
- pageserver
|
||||
- safekeeper
|
||||
@@ -19,11 +19,11 @@
|
||||
- pageserver
|
||||
- safekeeper
|
||||
|
||||
- name: upload and extract Neon binaries to /usr/local
|
||||
- name: upload and extract Zenith binaries to /usr/local
|
||||
ansible.builtin.unarchive:
|
||||
owner: root
|
||||
group: root
|
||||
src: neon_install.tar.gz
|
||||
src: zenith_install.tar.gz
|
||||
dest: /usr/local
|
||||
become: true
|
||||
tags:
|
||||
@@ -63,18 +63,21 @@
|
||||
tags:
|
||||
- pageserver
|
||||
|
||||
- name: update remote storage (s3) config
|
||||
lineinfile:
|
||||
path: /storage/pageserver/data/pageserver.toml
|
||||
line: "{{ item }}"
|
||||
loop:
|
||||
- "[remote_storage]"
|
||||
- "bucket_name = '{{ bucket_name }}'"
|
||||
- "bucket_region = '{{ bucket_region }}'"
|
||||
- "prefix_in_bucket = '{{ inventory_hostname }}'"
|
||||
become: true
|
||||
tags:
|
||||
- pageserver
|
||||
# It seems that currently S3 integration does not play well
|
||||
# even with fresh pageserver without a burden of old data.
|
||||
# TODO: turn this back on once the issue is solved.
|
||||
# - name: update remote storage (s3) config
|
||||
# lineinfile:
|
||||
# path: /storage/pageserver/data/pageserver.toml
|
||||
# line: "{{ item }}"
|
||||
# loop:
|
||||
# - "[remote_storage]"
|
||||
# - "bucket_name = '{{ bucket_name }}'"
|
||||
# - "bucket_region = '{{ bucket_region }}'"
|
||||
# - "prefix_in_bucket = '{{ inventory_hostname }}'"
|
||||
# become: true
|
||||
# tags:
|
||||
# - pageserver
|
||||
|
||||
- name: upload systemd service definition
|
||||
ansible.builtin.template:
|
||||
|
||||
@@ -4,10 +4,10 @@ set -e
|
||||
|
||||
RELEASE=${RELEASE:-false}
|
||||
|
||||
# look at docker hub for latest tag for neon docker image
|
||||
# look at docker hub for latest tag fo zenith docker image
|
||||
if [ "${RELEASE}" = "true" ]; then
|
||||
echo "search latest relase tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
|
||||
fi
|
||||
else
|
||||
echo "search latest dev tag"
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
|
||||
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/zenithdb/zenith/tags |jq -r -S '.[].name' | grep -v release | tail -1)
|
||||
if [ -z "${VERSION}" ]; then
|
||||
echo "no any docker tags found, exiting..."
|
||||
exit 1
|
||||
@@ -28,25 +28,25 @@ fi
|
||||
echo "found ${VERSION}"
|
||||
|
||||
# do initial cleanup
|
||||
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
|
||||
mkdir neon_install
|
||||
rm -rf zenith_install postgres_install.tar.gz zenith_install.tar.gz .zenith_current_version
|
||||
mkdir zenith_install
|
||||
|
||||
# retrive binaries from docker image
|
||||
echo "getting binaries from docker image"
|
||||
docker pull --quiet neondatabase/neon:${TAG}
|
||||
ID=$(docker create neondatabase/neon:${TAG})
|
||||
docker pull --quiet zenithdb/zenith:${TAG}
|
||||
ID=$(docker create zenithdb/zenith:${TAG})
|
||||
docker cp ${ID}:/data/postgres_install.tar.gz .
|
||||
tar -xzf postgres_install.tar.gz -C neon_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy neon_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres neon_install/bin/
|
||||
tar -xzf postgres_install.tar.gz -C zenith_install
|
||||
docker cp ${ID}:/usr/local/bin/pageserver zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/safekeeper zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/proxy zenith_install/bin/
|
||||
docker cp ${ID}:/usr/local/bin/postgres zenith_install/bin/
|
||||
docker rm -vf ${ID}
|
||||
|
||||
# store version to file (for ansible playbooks) and create binaries tarball
|
||||
echo ${VERSION} > neon_install/.neon_current_version
|
||||
echo ${VERSION} > .neon_current_version
|
||||
tar -czf neon_install.tar.gz -C neon_install .
|
||||
echo ${VERSION} > zenith_install/.zenith_current_version
|
||||
echo ${VERSION} > .zenith_current_version
|
||||
tar -czf zenith_install.tar.gz -C zenith_install .
|
||||
|
||||
# do final cleaup
|
||||
rm -rf neon_install postgres_install.tar.gz
|
||||
rm -rf zenith_install postgres_install.tar.gz
|
||||
|
||||
@@ -1,19 +0,0 @@
|
||||
[pageservers]
|
||||
neon-stress-ps-1 console_region_id=1
|
||||
neon-stress-ps-2 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
neon-stress-sk-1 console_region_id=1
|
||||
neon-stress-sk-2 console_region_id=1
|
||||
neon-stress-sk-3 console_region_id=1
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
safekeepers
|
||||
|
||||
[storage:vars]
|
||||
console_mgmt_base_url = http://neon-stress-console.local
|
||||
bucket_name = neon-storage-ireland
|
||||
bucket_region = eu-west-1
|
||||
etcd_endpoints = etcd-stress.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
@@ -1,6 +1,5 @@
|
||||
[pageservers]
|
||||
#zenith-1-ps-1 console_region_id=1
|
||||
zenith-1-ps-2 console_region_id=1
|
||||
zenith-1-ps-1 console_region_id=1
|
||||
|
||||
[safekeepers]
|
||||
zenith-1-sk-1 console_region_id=1
|
||||
@@ -15,5 +14,3 @@ safekeepers
|
||||
console_mgmt_base_url = http://console-release.local
|
||||
bucket_name = zenith-storage-oregon
|
||||
bucket_region = us-west-2
|
||||
etcd_endpoints = etcd-release.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -4,9 +4,9 @@ zenith-us-stage-ps-2 console_region_id=27
|
||||
|
||||
[safekeepers]
|
||||
zenith-us-stage-sk-1 console_region_id=27
|
||||
zenith-us-stage-sk-2 console_region_id=27
|
||||
zenith-us-stage-sk-3 console_region_id=27
|
||||
zenith-us-stage-sk-4 console_region_id=27
|
||||
zenith-us-stage-sk-5 console_region_id=27
|
||||
zenith-us-stage-sk-6 console_region_id=27
|
||||
|
||||
[storage:children]
|
||||
pageservers
|
||||
@@ -16,5 +16,3 @@ safekeepers
|
||||
console_mgmt_base_url = http://console-staging.local
|
||||
bucket_name = zenith-staging-storage-us-east-1
|
||||
bucket_region = us-east-1
|
||||
etcd_endpoints = etcd-staging.local:2379
|
||||
safekeeper_enable_s3_offload = false
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=pageserver
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
|
||||
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -6,7 +6,7 @@ After=network.target auditd.service
|
||||
Type=simple
|
||||
User=safekeeper
|
||||
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --enable-s3-offload={{ safekeeper_enable_s3_offload }}
|
||||
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data
|
||||
ExecReload=/bin/kill -HUP $MAINPID
|
||||
KillMode=mixed
|
||||
KillSignal=SIGINT
|
||||
|
||||
@@ -1,18 +1,18 @@
|
||||
version: 2.1
|
||||
|
||||
executors:
|
||||
neon-xlarge-executor:
|
||||
zenith-xlarge-executor:
|
||||
resource_class: xlarge
|
||||
docker:
|
||||
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
|
||||
- image: zimg/rust:1.58
|
||||
neon-executor:
|
||||
- image: zimg/rust:1.56
|
||||
zenith-executor:
|
||||
docker:
|
||||
- image: zimg/rust:1.58
|
||||
- image: zimg/rust:1.56
|
||||
|
||||
jobs:
|
||||
check-codestyle-rust:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
@@ -22,7 +22,7 @@ jobs:
|
||||
|
||||
# A job to build postgres
|
||||
build-postgres:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -67,9 +67,9 @@ jobs:
|
||||
paths:
|
||||
- tmp_install
|
||||
|
||||
# A job to build Neon rust code
|
||||
build-neon:
|
||||
executor: neon-xlarge-executor
|
||||
# A job to build zenith rust code
|
||||
build-zenith:
|
||||
executor: zenith-xlarge-executor
|
||||
parameters:
|
||||
build_type:
|
||||
type: enum
|
||||
@@ -113,7 +113,7 @@ jobs:
|
||||
CARGO_FLAGS=
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
CARGO_FLAGS="--release --features profiling"
|
||||
CARGO_FLAGS=--release
|
||||
fi
|
||||
|
||||
export CARGO_INCREMENTAL=0
|
||||
@@ -121,7 +121,7 @@ jobs:
|
||||
export RUSTC_WRAPPER=cachepot
|
||||
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
|
||||
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
|
||||
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
|
||||
cachepot -s
|
||||
|
||||
- save_cache:
|
||||
@@ -132,6 +132,20 @@ jobs:
|
||||
- ~/.cargo/git
|
||||
- target
|
||||
|
||||
# Run style checks
|
||||
# has to run separately from cargo fmt section
|
||||
# since needs to run with dependencies
|
||||
- run:
|
||||
name: cargo clippy
|
||||
command: |
|
||||
if [[ $BUILD_TYPE == "debug" ]]; then
|
||||
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
|
||||
elif [[ $BUILD_TYPE == "release" ]]; then
|
||||
cov_prefix=()
|
||||
fi
|
||||
|
||||
"${cov_prefix[@]}" ./run_clippy.sh
|
||||
|
||||
# Run rust unit tests
|
||||
- run:
|
||||
name: cargo test
|
||||
@@ -209,7 +223,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
check-codestyle-python:
|
||||
executor: neon-executor
|
||||
executor: zenith-executor
|
||||
steps:
|
||||
- checkout
|
||||
- restore_cache:
|
||||
@@ -222,12 +236,6 @@ jobs:
|
||||
key: v2-python-deps-{{ checksum "poetry.lock" }}
|
||||
paths:
|
||||
- /home/circleci/.cache/pypoetry/virtualenvs
|
||||
- run:
|
||||
name: Print versions
|
||||
when: always
|
||||
command: |
|
||||
poetry run python --version
|
||||
poetry show
|
||||
- run:
|
||||
name: Run yapf to ensure code format
|
||||
when: always
|
||||
@@ -238,7 +246,7 @@ jobs:
|
||||
command: poetry run mypy .
|
||||
|
||||
run-pytest:
|
||||
executor: neon-executor
|
||||
executor: zenith-executor
|
||||
parameters:
|
||||
# pytest args to specify the tests to run.
|
||||
#
|
||||
@@ -293,7 +301,7 @@ jobs:
|
||||
# `Too long with no output` error, if a test is running for a long time.
|
||||
# In that case, tests should have internal timeouts that are less than
|
||||
# no_output_timeout, specified here.
|
||||
no_output_timeout: 1m
|
||||
no_output_timeout: 10m
|
||||
environment:
|
||||
- ZENITH_BIN: /tmp/zenith/bin
|
||||
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
|
||||
@@ -354,7 +362,6 @@ jobs:
|
||||
fi
|
||||
fi
|
||||
- run:
|
||||
# TODO wait for processes to die in case of timeout?
|
||||
# CircleCI artifacts are preserved one file at a time, so skipping
|
||||
# this step isn't a good idea. If you want to extract the
|
||||
# pageserver state, perhaps a tarball would be a better idea.
|
||||
@@ -362,7 +369,7 @@ jobs:
|
||||
when: always
|
||||
command: |
|
||||
du -sh /tmp/test_output/*
|
||||
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
|
||||
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" -delete
|
||||
du -sh /tmp/test_output/*
|
||||
- store_artifacts:
|
||||
path: /tmp/test_output
|
||||
@@ -383,7 +390,7 @@ jobs:
|
||||
- "*"
|
||||
|
||||
coverage-report:
|
||||
executor: neon-xlarge-executor
|
||||
executor: zenith-xlarge-executor
|
||||
steps:
|
||||
- attach_workspace:
|
||||
at: /tmp/zenith
|
||||
@@ -398,7 +405,7 @@ jobs:
|
||||
- run:
|
||||
name: Build coverage report
|
||||
command: |
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/coverage \
|
||||
--dir=/tmp/zenith/coverage report \
|
||||
@@ -409,11 +416,11 @@ jobs:
|
||||
name: Upload coverage report
|
||||
command: |
|
||||
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
|
||||
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
|
||||
REPORT_URL=https://zenithdb.github.io/zenith-coverage-data/$CIRCLE_SHA1
|
||||
COMMIT_URL=https://github.com/zenithdb/zenith/commit/$CIRCLE_SHA1
|
||||
|
||||
scripts/git-upload \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
|
||||
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/zenithdb/zenith-coverage-data.git \
|
||||
--message="Add code coverage for $COMMIT_URL" \
|
||||
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
|
||||
|
||||
@@ -430,7 +437,7 @@ jobs:
|
||||
\"target_url\": \"$REPORT_URL\"
|
||||
}"
|
||||
|
||||
# Build neondatabase/neon:latest image and push it to Docker hub
|
||||
# Build zenithdb/zenith:latest image and push it to Docker hub
|
||||
docker-image:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -444,18 +451,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:latest
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:latest .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:latest
|
||||
|
||||
# Build neondatabase/compute-node:latest image and push it to Docker hub
|
||||
# Build zenithdb/compute-node:latest image and push it to Docker hub
|
||||
docker-image-compute:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -463,31 +470,31 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build neondatabase/compute-tools:latest image and push it to Docker hub
|
||||
# Build zenithdb/compute-tools:latest image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push neondatabase/compute-tools:latest
|
||||
--tag zenithdb/compute-tools:latest -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:latest
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:latest
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:latest vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:latest
|
||||
|
||||
# Build production neondatabase/neon:release image and push it to Docker hub
|
||||
# Build production zenithdb/zenith:release image and push it to Docker hub
|
||||
docker-image-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -501,18 +508,18 @@ jobs:
|
||||
- run:
|
||||
name: Build and push Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build \
|
||||
--pull \
|
||||
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
|
||||
docker push neondatabase/neon:${DOCKER_TAG}
|
||||
docker push neondatabase/neon:release
|
||||
--tag zenithdb/zenith:${DOCKER_TAG} --tag zenithdb/zenith:release .
|
||||
docker push zenithdb/zenith:${DOCKER_TAG}
|
||||
docker push zenithdb/zenith:release
|
||||
|
||||
# Build production neondatabase/compute-node:release image and push it to Docker hub
|
||||
# Build production zenithdb/compute-node:release image and push it to Docker hub
|
||||
docker-image-compute-release:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
@@ -520,29 +527,29 @@ jobs:
|
||||
- checkout
|
||||
- setup_remote_docker:
|
||||
docker_layer_caching: true
|
||||
# Build neondatabase/compute-tools:release image and push it to Docker hub
|
||||
# Build zenithdb/compute-tools:release image and push it to Docker hub
|
||||
# TODO: this should probably also use versioned tag, not just :latest.
|
||||
# XXX: but should it? We build and use it only locally now.
|
||||
- run:
|
||||
name: Build and push compute-tools Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
docker build \
|
||||
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
|
||||
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
|
||||
--tag neondatabase/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push neondatabase/compute-tools:release
|
||||
--tag zenithdb/compute-tools:release -f Dockerfile.compute-tools .
|
||||
docker push zenithdb/compute-tools:release
|
||||
- run:
|
||||
name: Init postgres submodule
|
||||
command: git submodule update --init --depth 1
|
||||
- run:
|
||||
name: Build and push compute-node Docker image
|
||||
command: |
|
||||
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
|
||||
echo $DOCKER_PWD | docker login -u $DOCKER_LOGIN --password-stdin
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres
|
||||
docker push neondatabase/compute-node:${DOCKER_TAG}
|
||||
docker push neondatabase/compute-node:release
|
||||
docker build --tag zenithdb/compute-node:${DOCKER_TAG} --tag zenithdb/compute-node:release vendor/postgres
|
||||
docker push zenithdb/compute-node:${DOCKER_TAG}
|
||||
docker push zenithdb/compute-node:release
|
||||
|
||||
deploy-staging:
|
||||
docker:
|
||||
@@ -568,7 +575,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i staging.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-staging-proxy:
|
||||
docker:
|
||||
@@ -586,63 +593,13 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
helm repo add zenithdb https://zenithdb.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-neon-stress:
|
||||
docker:
|
||||
- image: cimg/python:3.10
|
||||
steps:
|
||||
- checkout
|
||||
- setup_remote_docker
|
||||
- run:
|
||||
name: Setup ansible
|
||||
command: |
|
||||
pip install --progress-bar off --user ansible boto3
|
||||
- run:
|
||||
name: Redeploy
|
||||
command: |
|
||||
cd "$(pwd)/.circleci/ansible"
|
||||
|
||||
./get_binaries.sh
|
||||
|
||||
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
|
||||
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
|
||||
chmod 0600 ssh-key
|
||||
ssh-add ssh-key
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i neon-stress.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
|
||||
deploy-neon-stress-proxy:
|
||||
docker:
|
||||
- image: cimg/base:2021.04
|
||||
environment:
|
||||
KUBECONFIG: .kubeconfig
|
||||
steps:
|
||||
- checkout
|
||||
- run:
|
||||
name: Store kubeconfig file
|
||||
command: |
|
||||
echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
|
||||
chmod 0600 ${KUBECONFIG}
|
||||
- run:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG=$(git log --oneline|wc -l)
|
||||
helm upgrade neon-stress-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
deploy-release:
|
||||
docker:
|
||||
@@ -668,7 +625,7 @@ jobs:
|
||||
rm -f ssh-key ssh-key-cert.pub
|
||||
|
||||
ansible-playbook deploy.yaml -i production.hosts
|
||||
rm -f neon_install.tar.gz .neon_current_version
|
||||
rm -f zenith_install.tar.gz .zenith_current_version
|
||||
|
||||
deploy-release-proxy:
|
||||
docker:
|
||||
@@ -686,13 +643,12 @@ jobs:
|
||||
name: Setup helm v3
|
||||
command: |
|
||||
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
|
||||
helm repo add neondatabase https://neondatabase.github.io/helm-charts
|
||||
helm repo add zenithdb https://zenithdb.github.io/helm-charts
|
||||
- run:
|
||||
name: Re-deploy proxy
|
||||
command: |
|
||||
DOCKER_TAG="release-$(git log --oneline|wc -l)"
|
||||
helm upgrade neon-proxy neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
|
||||
|
||||
# Trigger a new remote CI job
|
||||
remote-ci-trigger:
|
||||
@@ -716,7 +672,7 @@ jobs:
|
||||
--data \
|
||||
"{
|
||||
\"state\": \"pending\",
|
||||
\"context\": \"neon-cloud-e2e\",
|
||||
\"context\": \"zenith-remote-ci\",
|
||||
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
|
||||
}"
|
||||
- run:
|
||||
@@ -732,7 +688,7 @@ jobs:
|
||||
"{
|
||||
\"ref\": \"main\",
|
||||
\"inputs\": {
|
||||
\"ci_job_name\": \"neon-cloud-e2e\",
|
||||
\"ci_job_name\": \"zenith-remote-ci\",
|
||||
\"commit_hash\": \"$CIRCLE_SHA1\",
|
||||
\"remote_repo\": \"$LOCAL_REPO\"
|
||||
}
|
||||
@@ -748,8 +704,8 @@ workflows:
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
- build-neon:
|
||||
name: build-neon-<< matrix.build_type >>
|
||||
- build-zenith:
|
||||
name: build-zenith-<< matrix.build_type >>
|
||||
matrix:
|
||||
parameters:
|
||||
build_type: ["debug", "release"]
|
||||
@@ -764,7 +720,7 @@ workflows:
|
||||
test_selection: batch_pg_regress
|
||||
needs_postgres_source: true
|
||||
requires:
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: other-tests-<< matrix.build_type >>
|
||||
matrix:
|
||||
@@ -772,7 +728,7 @@ workflows:
|
||||
build_type: ["debug", "release"]
|
||||
test_selection: batch_others
|
||||
requires:
|
||||
- build-neon-<< matrix.build_type >>
|
||||
- build-zenith-<< matrix.build_type >>
|
||||
- run-pytest:
|
||||
name: benchmarks
|
||||
context: PERF_TEST_RESULT_CONNSTR
|
||||
@@ -781,7 +737,7 @@ workflows:
|
||||
run_in_parallel: false
|
||||
save_perf_report: true
|
||||
requires:
|
||||
- build-neon-release
|
||||
- build-zenith-release
|
||||
- coverage-report:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
@@ -829,25 +785,6 @@ workflows:
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- deploy-neon-stress:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
- deploy-neon-stress-proxy:
|
||||
# deploy only for commits to main
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
requires:
|
||||
- docker-image
|
||||
|
||||
- docker-image-release:
|
||||
# Context gives an ability to login
|
||||
context: Docker Hub
|
||||
@@ -891,11 +828,11 @@ workflows:
|
||||
- remote-ci-trigger:
|
||||
# Context passes credentials for gh api
|
||||
context: CI_ACCESS_TOKEN
|
||||
remote_repo: "neondatabase/cloud"
|
||||
remote_repo: "zenithdb/console"
|
||||
requires:
|
||||
# XXX: Successful build doesn't mean everything is OK, but
|
||||
# the job to be triggered takes so much time to complete (~22 min)
|
||||
# that it's better not to wait for the commented-out steps
|
||||
- build-neon-release
|
||||
- build-zenith-debug
|
||||
# - pg_regress-tests-release
|
||||
# - other-tests-release
|
||||
|
||||
@@ -1,26 +0,0 @@
|
||||
fullnameOverride: "neon-stress-proxy-scram"
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://neon-stress-console.local/management/api/v2"
|
||||
domain: "*.stress.neon.tech"
|
||||
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: staging
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: ireland
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,34 +0,0 @@
|
||||
fullnameOverride: "neon-stress-proxy"
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.dev.neon.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy
|
||||
zenith_env: staging
|
||||
zenith_region: eu-west-1
|
||||
zenith_region_slug: ireland
|
||||
|
||||
service:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
|
||||
external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
|
||||
type: LoadBalancer
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,24 +0,0 @@
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-release.local/management/api/v2"
|
||||
domain: "*.cloud.neon.tech"
|
||||
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: production
|
||||
zenith_region: us-west-2
|
||||
zenith_region_slug: oregon
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,6 +1,9 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.neon.tech/psql_session/"
|
||||
authEndpoint: "https://console.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.zenith.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -22,7 +25,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authBackend: "console"
|
||||
authEndpoint: "http://console-staging.local/management/api/v2"
|
||||
domain: "*.cloud.stage.neon.tech"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
zenith_service: proxy-scram
|
||||
zenith_env: staging
|
||||
zenith_region: us-east-1
|
||||
zenith_region_slug: virginia
|
||||
|
||||
exposedService:
|
||||
annotations:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
serviceMonitor:
|
||||
enabled: true
|
||||
selector:
|
||||
release: kube-prometheus-stack
|
||||
@@ -1,12 +1,9 @@
|
||||
# Helm chart values for zenith-proxy.
|
||||
# This is a YAML-formatted file.
|
||||
|
||||
image:
|
||||
repository: neondatabase/neon
|
||||
|
||||
settings:
|
||||
authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.neon.tech/psql_session/"
|
||||
authEndpoint: "https://console.stage.zenith.tech/authenticate_proxy_request/"
|
||||
uri: "https://console.stage.zenith.tech/psql_session/"
|
||||
|
||||
# -- Additional labels for zenith-proxy pods
|
||||
podLabels:
|
||||
@@ -20,7 +17,7 @@ exposedService:
|
||||
service.beta.kubernetes.io/aws-load-balancer-type: external
|
||||
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
|
||||
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
|
||||
external-dns.alpha.kubernetes.io/hostname: connect.stage.neon.tech
|
||||
external-dns.alpha.kubernetes.io/hostname: start.stage.zenith.tech
|
||||
|
||||
metrics:
|
||||
enabled: true
|
||||
|
||||
@@ -10,8 +10,6 @@ dep-format-version = "2"
|
||||
# Hakari works much better with the new feature resolver.
|
||||
# For more about the new feature resolver, see:
|
||||
# https://blog.rust-lang.org/2021/03/25/Rust-1.51.0.html#cargos-new-feature-resolver
|
||||
# Have to keep the resolver still here since hakari requires this field,
|
||||
# despite it's now the default for 2021 edition & cargo.
|
||||
resolver = "2"
|
||||
|
||||
# Add triples corresponding to platforms commonly used by developers here.
|
||||
|
||||
13
.github/workflows/benchmarking.yml
vendored
13
.github/workflows/benchmarking.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
runs-on: [self-hosted, zenith-benchmarker]
|
||||
|
||||
env:
|
||||
POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
|
||||
PG_BIN: "/usr/pgsql-13/bin"
|
||||
|
||||
steps:
|
||||
- name: Checkout zenith repo
|
||||
@@ -51,7 +51,7 @@ jobs:
|
||||
echo Poetry
|
||||
poetry --version
|
||||
echo Pgbench
|
||||
$POSTGRES_DISTRIB_DIR/bin/pgbench --version
|
||||
$PG_BIN/pgbench --version
|
||||
|
||||
# FIXME cluster setup is skipped due to various changes in console API
|
||||
# for now pre created cluster is used. When API gain some stability
|
||||
@@ -66,7 +66,7 @@ jobs:
|
||||
|
||||
echo "Starting cluster"
|
||||
# wake up the cluster
|
||||
$POSTGRES_DISTRIB_DIR/bin/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
$PG_BIN/psql $BENCHMARK_CONNSTR -c "SELECT 1"
|
||||
|
||||
- name: Run benchmark
|
||||
# pgbench is installed system wide from official repo
|
||||
@@ -83,11 +83,8 @@ jobs:
|
||||
# sudo yum install postgresql13-contrib
|
||||
# actual binaries are located in /usr/pgsql-13/bin/
|
||||
env:
|
||||
# The pgbench test runs two tests of given duration against each scale.
|
||||
# So the total runtime with these parameters is 2 * 2 * 300 = 1200, or 20 minutes.
|
||||
# Plus time needed to initialize the test databases.
|
||||
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
|
||||
TEST_PG_BENCH_TRANSACTIONS_MATRIX: "5000,10000,20000"
|
||||
TEST_PG_BENCH_SCALES_MATRIX: "10,15"
|
||||
PLATFORM: "zenith-staging"
|
||||
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
|
||||
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
|
||||
|
||||
21
.github/workflows/testing.yml
vendored
21
.github/workflows/testing.yml
vendored
@@ -1,8 +1,6 @@
|
||||
name: Build and Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
on: push
|
||||
|
||||
jobs:
|
||||
regression-check:
|
||||
@@ -38,7 +36,8 @@ jobs:
|
||||
|
||||
- name: Install macOs postgres dependencies
|
||||
if: matrix.os == 'macos-latest'
|
||||
run: brew install flex bison
|
||||
run: |
|
||||
brew install flex bison
|
||||
|
||||
- name: Set pg revision for caching
|
||||
id: pg_ver
|
||||
@@ -54,7 +53,8 @@ jobs:
|
||||
|
||||
- name: Build postgres
|
||||
if: steps.cache_pg.outputs.cache-hit != 'true'
|
||||
run: make postgres
|
||||
run: |
|
||||
make postgres
|
||||
|
||||
- name: Cache cargo deps
|
||||
id: cache_cargo
|
||||
@@ -64,10 +64,13 @@ jobs:
|
||||
~/.cargo/registry
|
||||
~/.cargo/git
|
||||
target
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}
|
||||
key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
|
||||
|
||||
- name: Run cargo clippy
|
||||
run: ./run_clippy.sh
|
||||
# Use `env CARGO_INCREMENTAL=0` to mitigate https://github.com/rust-lang/rust/issues/91696 for rustc 1.57.0
|
||||
- name: Run cargo build
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo build --workspace --bins --examples --tests
|
||||
|
||||
- name: Run cargo test
|
||||
run: cargo test --all --all-targets
|
||||
run: |
|
||||
env CARGO_INCREMENTAL=0 cargo test -- --nocapture --test-threads=1
|
||||
|
||||
3
.gitignore
vendored
3
.gitignore
vendored
@@ -11,6 +11,3 @@ test_output/
|
||||
# Coverage
|
||||
*.profraw
|
||||
*.profdata
|
||||
|
||||
*.key
|
||||
*.crt
|
||||
|
||||
20
COPYRIGHT
Normal file
20
COPYRIGHT
Normal file
@@ -0,0 +1,20 @@
|
||||
This software is licensed under the Apache 2.0 License:
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
Copyright 2021 Zenith Labs, Inc
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
The PostgreSQL submodule in vendor/postgres is licensed under the
|
||||
PostgreSQL license. See vendor/postgres/COPYRIGHT.
|
||||
1270
Cargo.lock
generated
1270
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
15
Cargo.toml
15
Cargo.toml
@@ -3,19 +3,22 @@ members = [
|
||||
"compute_tools",
|
||||
"control_plane",
|
||||
"pageserver",
|
||||
"postgres_ffi",
|
||||
"proxy",
|
||||
"safekeeper",
|
||||
"walkeeper",
|
||||
"workspace_hack",
|
||||
"neon_local",
|
||||
"libs/*",
|
||||
"zenith",
|
||||
"zenith_metrics",
|
||||
"zenith_utils",
|
||||
]
|
||||
resolver = "2"
|
||||
|
||||
[profile.release]
|
||||
# This is useful for profiling and, to some extent, debug.
|
||||
# Besides, debug info should not affect the performance.
|
||||
debug = true
|
||||
|
||||
# This is only needed for proxy's tests.
|
||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||
# This is only needed for proxy's tests
|
||||
# TODO: we should probably fork tokio-postgres-rustls instead
|
||||
[patch.crates-io]
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
|
||||
15
Dockerfile
15
Dockerfile
@@ -1,5 +1,7 @@
|
||||
# Build Postgres
|
||||
FROM zimg/rust:1.58 AS pg-build
|
||||
#
|
||||
#FROM zimg/rust:1.56 AS pg-build
|
||||
FROM zenithdb/build:buster-20220309 AS pg-build
|
||||
WORKDIR /pg
|
||||
|
||||
USER root
|
||||
@@ -9,26 +11,27 @@ COPY Makefile Makefile
|
||||
|
||||
ENV BUILD_TYPE release
|
||||
RUN set -e \
|
||||
&& mold -run make -j $(nproc) -s postgres \
|
||||
&& make -j $(nproc) -s postgres \
|
||||
&& rm -rf tmp_install/build \
|
||||
&& tar -C tmp_install -czf /postgres_install.tar.gz .
|
||||
|
||||
# Build zenith binaries
|
||||
FROM zimg/rust:1.58 AS build
|
||||
#
|
||||
#FROM zimg/rust:1.56 AS build
|
||||
FROM zenithdb/build:buster-20220309 AS build
|
||||
ARG GIT_VERSION=local
|
||||
|
||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
ARG AWS_ACCESS_KEY_ID
|
||||
ARG AWS_SECRET_ACCESS_KEY
|
||||
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
|
||||
|
||||
COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
|
||||
COPY . .
|
||||
|
||||
# Show build caching stats to check if it was used in the end.
|
||||
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats.
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build --release \
|
||||
&& cachepot -s
|
||||
RUN cargo build --release && /usr/local/cargo/bin/cachepot -s
|
||||
|
||||
# Build final image
|
||||
#
|
||||
|
||||
23
Dockerfile.build
Normal file
23
Dockerfile.build
Normal file
@@ -0,0 +1,23 @@
|
||||
FROM rust:1.56.1-slim-buster
|
||||
WORKDIR /home/circleci/project
|
||||
|
||||
RUN set -e \
|
||||
&& apt-get update \
|
||||
&& apt-get -yq install \
|
||||
automake \
|
||||
libtool \
|
||||
build-essential \
|
||||
bison \
|
||||
flex \
|
||||
libreadline-dev \
|
||||
zlib1g-dev \
|
||||
libxml2-dev \
|
||||
libseccomp-dev \
|
||||
pkg-config \
|
||||
libssl-dev \
|
||||
clang
|
||||
|
||||
RUN set -e \
|
||||
&& rustup component add clippy \
|
||||
&& cargo install cargo-audit \
|
||||
&& cargo install --git https://github.com/paritytech/cachepot
|
||||
@@ -1,18 +1,19 @@
|
||||
# First transient image to build compute_tools binaries
|
||||
# NB: keep in sync with rust image version in .circle/config.yml
|
||||
FROM zimg/rust:1.58 AS rust-build
|
||||
FROM zenithdb/build:buster-20220309 AS rust-build
|
||||
|
||||
WORKDIR /zenith
|
||||
|
||||
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
|
||||
ARG AWS_ACCESS_KEY_ID
|
||||
ARG AWS_SECRET_ACCESS_KEY
|
||||
ENV RUSTC_WRAPPER /usr/local/cargo/bin/cachepot
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN set -e \
|
||||
&& sudo -E "PATH=$PATH" mold -run cargo build -p compute_tools --release \
|
||||
&& cachepot -s
|
||||
RUN cargo build -p compute_tools --release && /usr/local/cargo/bin/cachepot -s
|
||||
|
||||
# Final image that only has one binary
|
||||
FROM debian:buster-slim
|
||||
|
||||
COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
|
||||
COPY --from=rust-build /zenith/target/release/zenith_ctl /usr/local/bin/zenith_ctl
|
||||
|
||||
12
Makefile
12
Makefile
@@ -12,21 +12,15 @@ endif
|
||||
#
|
||||
BUILD_TYPE ?= debug
|
||||
ifeq ($(BUILD_TYPE),release)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
|
||||
PG_CONFIGURE_OPTS = --enable-debug
|
||||
PG_CFLAGS = -O2 -g3 $(CFLAGS)
|
||||
# Unfortunately, `--profile=...` is a nightly feature
|
||||
CARGO_BUILD_FLAGS += --release
|
||||
else ifeq ($(BUILD_TYPE),debug)
|
||||
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
|
||||
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
|
||||
PG_CFLAGS = -O0 -g3 $(CFLAGS)
|
||||
else
|
||||
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# macOS with brew-installed openssl requires explicit paths
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
PG_CONFIGURE_OPTS += --with-includes=/usr/local/opt/openssl/include --with-libraries=/usr/local/opt/openssl/lib
|
||||
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
|
||||
endif
|
||||
|
||||
# Choose whether we should be silent or verbose
|
||||
|
||||
5
NOTICE
5
NOTICE
@@ -1,5 +0,0 @@
|
||||
Neon
|
||||
Copyright 2022 Neon Inc.
|
||||
|
||||
The PostgreSQL submodule in vendor/postgres is licensed under the
|
||||
PostgreSQL license. See vendor/postgres/COPYRIGHT.
|
||||
153
README.md
153
README.md
@@ -1,122 +1,80 @@
|
||||
# Neon
|
||||
# Zenith
|
||||
|
||||
Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
The project used to be called "Zenith". Many of the commands and code comments
|
||||
still refer to "zenith", but we are in the process of renaming things.
|
||||
Zenith is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
|
||||
|
||||
## Architecture overview
|
||||
|
||||
A Neon installation consists of compute nodes and Neon storage engine.
|
||||
A Zenith installation consists of compute nodes and Zenith storage engine.
|
||||
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.
|
||||
Compute nodes are stateless PostgreSQL nodes, backed by Zenith storage engine.
|
||||
|
||||
Neon storage engine consists of two major components:
|
||||
Zenith storage engine consists of two major components:
|
||||
- Pageserver. Scalable storage backend for compute nodes.
|
||||
- WAL service. The service that receives WAL from compute node and ensures that it is stored durably.
|
||||
|
||||
Pageserver consists of:
|
||||
- Repository - Neon storage implementation.
|
||||
- Repository - Zenith storage implementation.
|
||||
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
|
||||
- Page service - service that communicates with compute nodes and responds with pages from the repository.
|
||||
- WAL redo - service that builds pages from base images and WAL records on Page service request.
|
||||
|
||||
## Running local installation
|
||||
|
||||
|
||||
#### building on Ubuntu/ Debian (Linux)
|
||||
1. Install build dependencies and other useful packages
|
||||
|
||||
On Ubuntu or Debian this set of packages should be sufficient to build the code:
|
||||
```text
|
||||
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
|
||||
libssl-dev clang pkg-config libpq-dev libprotobuf-dev etcd
|
||||
libssl-dev clang pkg-config libpq-dev
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
[Rust] 1.56.1 or later is also required.
|
||||
|
||||
3. Install PostgreSQL Client
|
||||
```
|
||||
apt install postgresql-client
|
||||
```
|
||||
|
||||
4. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### building on OSX (12.3.1)
|
||||
1. Install XCode and dependencies
|
||||
```
|
||||
xcode-select --install
|
||||
brew install protobuf etcd
|
||||
```
|
||||
|
||||
2. [Install Rust](https://www.rust-lang.org/tools/install)
|
||||
```
|
||||
# recommended approach from https://www.rust-lang.org/tools/install
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||
```
|
||||
|
||||
3. Install PostgreSQL Client
|
||||
```
|
||||
# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos
|
||||
brew install libpq
|
||||
brew link --force libpq
|
||||
```
|
||||
|
||||
4. Build neon and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
cd neon
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### dependency installation notes
|
||||
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
|
||||
|
||||
To run the integration tests or Python scripts (not required to use the code), install
|
||||
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
|
||||
|
||||
2. Build zenith and patched postgres
|
||||
```sh
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
cd zenith
|
||||
make -j5
|
||||
```
|
||||
|
||||
#### running neon database
|
||||
1. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
3. Start pageserver and postgres on top of it (should be called from repo root):
|
||||
```sh
|
||||
# Create repository in .zenith with proper paths to binaries and data
|
||||
# Later that would be responsibility of a package install script
|
||||
> ./target/debug/neon_local init
|
||||
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
|
||||
created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
|
||||
initial timeline de200bd42b49cc1814412c7e592dd6e9 created
|
||||
> ./target/debug/zenith init
|
||||
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
|
||||
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
|
||||
created main branch
|
||||
pageserver init succeeded
|
||||
|
||||
# start pageserver and safekeeper
|
||||
> ./target/debug/neon_local start
|
||||
Starting pageserver at '127.0.0.1:64000' in '.zenith'
|
||||
> ./target/debug/zenith start
|
||||
Starting pageserver at 'localhost:64000' in '.zenith'
|
||||
Pageserver started
|
||||
initializing for sk 1 for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
|
||||
initializing for single for 7676
|
||||
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
|
||||
Safekeeper started
|
||||
|
||||
# start postgres compute node
|
||||
> ./target/debug/neon_local pg start main
|
||||
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
|
||||
> ./target/debug/zenith pg start main
|
||||
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
|
||||
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
|
||||
waiting for server to start.... done
|
||||
server started
|
||||
|
||||
# check list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
|
||||
> ./target/debug/zenith pg list
|
||||
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
|
||||
```
|
||||
|
||||
2. Now it is possible to connect to postgres and run some queries:
|
||||
4. Now it is possible to connect to postgres and run some queries:
|
||||
```text
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# CREATE TABLE t(key int primary key, value text);
|
||||
@@ -130,28 +88,21 @@ postgres=# select * from t;
|
||||
(1 row)
|
||||
```
|
||||
|
||||
3. And create branches and run postgres on them:
|
||||
5. And create branches and run postgres on them:
|
||||
```sh
|
||||
# create branch named migration_check
|
||||
> ./target/debug/neon_local timeline branch --branch-name migration_check
|
||||
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
|
||||
> ./target/debug/zenith timeline branch --branch-name migration_check
|
||||
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
|
||||
|
||||
# check branches tree
|
||||
> ./target/debug/neon_local timeline list
|
||||
(L) main [de200bd42b49cc1814412c7e592dd6e9]
|
||||
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
|
||||
> ./target/debug/zenith timeline list
|
||||
main [5b014a9e41b4b63ce1a1febc04503636]
|
||||
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
|
||||
|
||||
# start postgres on that branch
|
||||
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
|
||||
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
|
||||
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=zenith_admin dbname=postgres'
|
||||
|
||||
# check the new list of running postgres instances
|
||||
> ./target/debug/neon_local pg list
|
||||
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
|
||||
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
|
||||
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
|
||||
> ./target/debug/zenith pg start migration_check
|
||||
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
|
||||
waiting for server to start.... done
|
||||
|
||||
# this new postgres instance will have all the data from 'main' postgres,
|
||||
# but all modifications would not affect data in original postgres
|
||||
@@ -164,26 +115,18 @@ postgres=# select * from t;
|
||||
|
||||
postgres=# insert into t values(2,2);
|
||||
INSERT 0 1
|
||||
|
||||
# check that the new change doesn't affect the 'main' postgres
|
||||
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
|
||||
postgres=# select * from t;
|
||||
key | value
|
||||
-----+-------
|
||||
1 | 1
|
||||
(1 row)
|
||||
```
|
||||
|
||||
4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
|
||||
you have just started. You can stop them all with one command:
|
||||
```sh
|
||||
> ./target/debug/neon_local stop
|
||||
> ./target/debug/zenith stop
|
||||
```
|
||||
|
||||
## Running tests
|
||||
|
||||
```sh
|
||||
git clone --recursive https://github.com/neondatabase/neon.git
|
||||
git clone --recursive https://github.com/zenithdb/zenith.git
|
||||
make # builds also postgres and installs it to ./tmp_install
|
||||
./scripts/pytest
|
||||
```
|
||||
@@ -198,14 +141,14 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d
|
||||
|
||||
### Postgres-specific terms
|
||||
|
||||
Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Due to Zenith's very close relation with PostgreSQL internals, there are numerous specific terms used.
|
||||
Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
|
||||
|
||||
To get more familiar with this aspect, refer to:
|
||||
|
||||
- [Neon glossary](/docs/glossary.md)
|
||||
- [Zenith glossary](/docs/glossary.md)
|
||||
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
|
||||
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
|
||||
- Other PostgreSQL documentation and sources (Zenith fork sources can be found [here](https://github.com/zenithdb/postgres))
|
||||
|
||||
## Join the development
|
||||
|
||||
|
||||
@@ -11,11 +11,10 @@ clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
log = { version = "0.4", features = ["std", "serde"] }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="9eb0dbfbeb6a6c1b79099b9f7ae4a8c021877858" }
|
||||
regex = "1"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
tar = "0.4"
|
||||
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
# Compute node tools
|
||||
|
||||
Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
`ExecStart` option. It will handle all the `Neon` specifics during compute node
|
||||
Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
`ExecStart` option. It will handle all the `zenith` specifics during compute node
|
||||
initialization:
|
||||
- `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
- `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
- Every start is a fresh start, so the data directory is removed and
|
||||
initialized again on each run.
|
||||
- Next it will put configuration files into the `PGDATA` directory.
|
||||
@@ -13,18 +13,18 @@ initialization:
|
||||
- Check and alter/drop/create roles and databases.
|
||||
- Hang waiting on the `postmaster` process to exit.
|
||||
|
||||
Also `compute_ctl` spawns two separate service threads:
|
||||
Also `zenith_ctl` spawns two separate service threads:
|
||||
- `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
into the shared `ComputeNode`;
|
||||
into the shared `ComputeState`;
|
||||
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
last activity requests.
|
||||
|
||||
Usage example:
|
||||
```sh
|
||||
compute_ctl -D /var/db/postgres/compute \
|
||||
-C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
-S /var/db/postgres/specs/current.json \
|
||||
-b /usr/local/bin/postgres
|
||||
zenith_ctl -D /var/db/postgres/compute \
|
||||
-C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
-S /var/db/postgres/specs/current.json \
|
||||
-b /usr/local/bin/postgres
|
||||
```
|
||||
|
||||
## Tests
|
||||
|
||||
@@ -1,174 +0,0 @@
|
||||
//!
|
||||
//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
|
||||
//! initialization:
|
||||
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
//! - Try to start `postgres` and wait until it is ready to accept connections.
|
||||
//! - Check and alter/drop/create roles and databases.
|
||||
//! - Hang waiting on the `postmaster` process to exit.
|
||||
//!
|
||||
//! Also `compute_ctl` spawns two separate service threads:
|
||||
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
//! into the shared `ComputeNode`;
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! compute_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::exit;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{thread, time::Duration};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::{error, info};
|
||||
|
||||
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
|
||||
use compute_tools::http::api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("compute_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
|
||||
let spec: ComputeSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.page_server_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_tenant")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_timeline")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
let compute_state = ComputeNode {
|
||||
start_time: Utc::now(),
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
tenant,
|
||||
timeline,
|
||||
pageserver_connstr,
|
||||
metrics: ComputeMetrics::new(),
|
||||
state: RwLock::new(ComputeState::new()),
|
||||
};
|
||||
let compute = Arc::new(compute_state);
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
|
||||
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it.
|
||||
match compute.prepare_and_run() {
|
||||
Ok(ec) => {
|
||||
let code = ec.code().unwrap_or(1);
|
||||
info!("Postgres exited with code {}, shutting down", code);
|
||||
exit(code)
|
||||
}
|
||||
Err(error) => {
|
||||
error!("could not start the compute node: {}", error);
|
||||
|
||||
let mut state = compute.state.write().unwrap();
|
||||
state.error = Some(format!("{:?}", error));
|
||||
state.status = ComputeStatus::Failed;
|
||||
drop(state);
|
||||
|
||||
// Keep serving HTTP requests, so the cloud control plane was able to
|
||||
// get the actual error.
|
||||
info!("giving control plane 30s to collect the error before shutdown");
|
||||
thread::sleep(Duration::from_secs(30));
|
||||
info!("shutting down");
|
||||
Err(error)
|
||||
}
|
||||
}
|
||||
}
|
||||
249
compute_tools/src/bin/zenith_ctl.rs
Normal file
249
compute_tools/src/bin/zenith_ctl.rs
Normal file
@@ -0,0 +1,249 @@
|
||||
//!
|
||||
//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
|
||||
//! `ExecStart` option. It will handle all the `zenith` specifics during compute node
|
||||
//! initialization:
|
||||
//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
|
||||
//! - Every start is a fresh start, so the data directory is removed and
|
||||
//! initialized again on each run.
|
||||
//! - Next it will put configuration files into the `PGDATA` directory.
|
||||
//! - Sync safekeepers and get commit LSN.
|
||||
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
|
||||
//! - Try to start `postgres` and wait until it is ready to accept connections.
|
||||
//! - Check and alter/drop/create roles and databases.
|
||||
//! - Hang waiting on the `postmaster` process to exit.
|
||||
//!
|
||||
//! Also `zenith_ctl` spawns two separate service threads:
|
||||
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
|
||||
//! into the shared `ComputeState`;
|
||||
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
|
||||
//! last activity requests.
|
||||
//!
|
||||
//! Usage example:
|
||||
//! ```sh
|
||||
//! zenith_ctl -D /var/db/postgres/compute \
|
||||
//! -C 'postgresql://zenith_admin@localhost/postgres' \
|
||||
//! -S /var/db/postgres/specs/current.json \
|
||||
//! -b /usr/local/bin/postgres
|
||||
//! ```
|
||||
//!
|
||||
use std::fs::File;
|
||||
use std::panic;
|
||||
use std::path::Path;
|
||||
use std::process::{exit, Command, ExitStatus};
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Arg;
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use compute_tools::config;
|
||||
use compute_tools::http_api::launch_http_server;
|
||||
use compute_tools::logger::*;
|
||||
use compute_tools::monitor::launch_monitor;
|
||||
use compute_tools::params::*;
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::*;
|
||||
use compute_tools::zenith::*;
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
|
||||
let state = state.read().unwrap();
|
||||
let spec = &state.spec;
|
||||
let pgdata_path = Path::new(&state.pgdata);
|
||||
let pageserver_connstr = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.page_server_connstring")
|
||||
.expect("pageserver connstr should be provided");
|
||||
let tenant = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_tenant")
|
||||
.expect("tenant id should be provided");
|
||||
let timeline = spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("zenith.zenith_timeline")
|
||||
.expect("tenant id should be provided");
|
||||
|
||||
info!(
|
||||
"starting cluster #{}, operation #{}",
|
||||
spec.cluster.cluster_id,
|
||||
spec.operation_uuid.as_ref().unwrap()
|
||||
);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
create_pgdata(&state.pgdata)?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, pageserver_connstr
|
||||
);
|
||||
get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
|
||||
|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, pageserver_connstr
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
|
||||
let read_state = state.read().unwrap();
|
||||
let pgdata_path = Path::new(&read_state.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&read_state.pgbin)
|
||||
.args(&["-D", &read_state.pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Try default Postgres port if it is not provided
|
||||
let port = read_state
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("port")
|
||||
.unwrap_or_else(|| "5432".to_string());
|
||||
wait_for_postgres(&port, pgdata_path)?;
|
||||
|
||||
let mut client = Client::connect(&read_state.connstr, NoTls)?;
|
||||
|
||||
handle_roles(&read_state.spec, &mut client)?;
|
||||
handle_databases(&read_state.spec, &mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
|
||||
info!(
|
||||
"finished configuration of cluster #{}",
|
||||
read_state.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
// Release the read lock.
|
||||
drop(read_state);
|
||||
|
||||
// Get the write lock, update state and release the lock, so HTTP API
|
||||
// was able to serve requests, while we are blocked waiting on
|
||||
// Postgres.
|
||||
let mut state = state.write().unwrap();
|
||||
state.ready = true;
|
||||
drop(state);
|
||||
|
||||
// Wait for child postgres process basically forever. In this state Ctrl+C
|
||||
// will be propagated to postgres and it will be shut down as well.
|
||||
let ecode = pg.wait().expect("failed to wait on postgres");
|
||||
|
||||
Ok(ecode)
|
||||
}
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// TODO: re-use `zenith_utils::logging` later
|
||||
init_logger(DEFAULT_LOG_LEVEL)?;
|
||||
|
||||
// Env variable is set by `cargo`
|
||||
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
|
||||
let matches = clap::App::new("zenith_ctl")
|
||||
.version(version.unwrap_or("unknown"))
|
||||
.arg(
|
||||
Arg::new("connstr")
|
||||
.short('C')
|
||||
.long("connstr")
|
||||
.value_name("DATABASE_URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgdata")
|
||||
.short('D')
|
||||
.long("pgdata")
|
||||
.value_name("DATADIR")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pgbin")
|
||||
.short('b')
|
||||
.long("pgbin")
|
||||
.value_name("POSTGRES_PATH"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec")
|
||||
.short('s')
|
||||
.long("spec")
|
||||
.value_name("SPEC_JSON"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("spec-path")
|
||||
.short('S')
|
||||
.long("spec-path")
|
||||
.value_name("SPEC_PATH"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
|
||||
let connstr = matches
|
||||
.value_of("connstr")
|
||||
.expect("Postgres connection string is required");
|
||||
let spec = matches.value_of("spec");
|
||||
let spec_path = matches.value_of("spec-path");
|
||||
|
||||
// Try to use just 'postgres' if no path is provided
|
||||
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
|
||||
|
||||
let spec: ClusterSpec = match spec {
|
||||
// First, try to get cluster spec from the cli argument
|
||||
Some(json) => serde_json::from_str(json)?,
|
||||
None => {
|
||||
// Second, try to read it from the file if path is provided
|
||||
if let Some(sp) = spec_path {
|
||||
let path = Path::new(sp);
|
||||
let file = File::open(path)?;
|
||||
serde_json::from_reader(file)?
|
||||
} else {
|
||||
panic!("cluster spec should be provided via --spec or --spec-path argument");
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let compute_state = ComputeState {
|
||||
connstr: connstr.to_string(),
|
||||
pgdata: pgdata.to_string(),
|
||||
pgbin: pgbin.to_string(),
|
||||
spec,
|
||||
ready: false,
|
||||
last_active: Utc::now(),
|
||||
};
|
||||
let compute_state = Arc::new(RwLock::new(compute_state));
|
||||
|
||||
// Launch service threads first, so we were able to serve availability
|
||||
// requests, while configuration is still in progress.
|
||||
let mut _threads = vec![
|
||||
launch_http_server(&compute_state).expect("cannot launch compute monitor thread"),
|
||||
launch_monitor(&compute_state).expect("cannot launch http endpoint thread"),
|
||||
];
|
||||
|
||||
prepare_pgdata(&compute_state)?;
|
||||
|
||||
// Run compute (Postgres) and hang waiting on it. Panic if any error happens,
|
||||
// it will help us to trigger unwind and kill postmaster as well.
|
||||
match run_compute(&compute_state) {
|
||||
Ok(ec) => exit(ec.success() as i32),
|
||||
Err(error) => panic!("cannot start compute node, error: {}", error),
|
||||
}
|
||||
}
|
||||
@@ -1,46 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use log::error;
|
||||
use postgres::Client;
|
||||
use tokio_postgres::NoTls;
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
|
||||
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
|
||||
let query = "
|
||||
CREATE TABLE IF NOT EXISTS health_check (
|
||||
id serial primary key,
|
||||
updated_at timestamptz default now()
|
||||
);
|
||||
INSERT INTO health_check VALUES (1, now())
|
||||
ON CONFLICT (id) DO UPDATE
|
||||
SET updated_at = now();";
|
||||
let result = client.simple_query(query)?;
|
||||
if result.len() < 2 {
|
||||
return Err(anyhow::format_err!("executed {} queries", result.len()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
|
||||
let connstr = &compute.connstr;
|
||||
let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
|
||||
if client.is_closed() {
|
||||
return Err(anyhow!("connection to postgres closed"));
|
||||
}
|
||||
tokio::spawn(async move {
|
||||
if let Err(e) = connection.await {
|
||||
error!("connection error: {}", e);
|
||||
}
|
||||
});
|
||||
|
||||
let result = client
|
||||
.simple_query("UPDATE health_check SET updated_at = now() WHERE id = 1;")
|
||||
.await?;
|
||||
|
||||
if result.len() != 1 {
|
||||
return Err(anyhow!("statement can't be executed"));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,321 +0,0 @@
|
||||
//
|
||||
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
|
||||
// but there are several things that makes `PostgresNode` usage inconvenient in the
|
||||
// cloud:
|
||||
// - it inherits from `LocalEnv`, which contains **all-all** the information about
|
||||
// a complete service running
|
||||
// - it uses `PageServerNode` with information about http endpoint, which we do not
|
||||
// need in the cloud again
|
||||
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
|
||||
//
|
||||
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
|
||||
// attributes (not required for the cloud). Yet, it is still tempting to unify these
|
||||
// `PostgresNode` and `ComputeNode` and use one in both places.
|
||||
//
|
||||
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
|
||||
//
|
||||
use std::fs;
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::{Command, ExitStatus, Stdio};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::RwLock;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Utc};
|
||||
use log::info;
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::{Serialize, Serializer};
|
||||
|
||||
use crate::checker::create_writablity_check_data;
|
||||
use crate::config;
|
||||
use crate::pg_helpers::*;
|
||||
use crate::spec::*;
|
||||
|
||||
/// Compute node info shared across several `compute_ctl` threads.
|
||||
pub struct ComputeNode {
|
||||
pub start_time: DateTime<Utc>,
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ComputeSpec,
|
||||
pub tenant: String,
|
||||
pub timeline: String,
|
||||
pub pageserver_connstr: String,
|
||||
pub metrics: ComputeMetrics,
|
||||
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
|
||||
/// to allow HTTP API server to serve status requests, while configuration
|
||||
/// is in progress.
|
||||
pub state: RwLock<ComputeState>,
|
||||
}
|
||||
|
||||
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: Serializer,
|
||||
{
|
||||
x.to_rfc3339().serialize(s)
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct ComputeState {
|
||||
pub status: ComputeStatus,
|
||||
/// Timestamp of the last Postgres activity
|
||||
#[serde(serialize_with = "rfc3339_serialize")]
|
||||
pub last_active: DateTime<Utc>,
|
||||
pub error: Option<String>,
|
||||
}
|
||||
|
||||
impl ComputeState {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
status: ComputeStatus::Init,
|
||||
last_active: Utc::now(),
|
||||
error: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeState {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ComputeStatus {
|
||||
Init,
|
||||
Running,
|
||||
Failed,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct ComputeMetrics {
|
||||
pub sync_safekeepers_ms: AtomicU64,
|
||||
pub basebackup_ms: AtomicU64,
|
||||
pub config_ms: AtomicU64,
|
||||
pub total_startup_ms: AtomicU64,
|
||||
}
|
||||
|
||||
impl ComputeMetrics {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
sync_safekeepers_ms: AtomicU64::new(0),
|
||||
basebackup_ms: AtomicU64::new(0),
|
||||
config_ms: AtomicU64::new(0),
|
||||
total_startup_ms: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ComputeMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ComputeNode {
|
||||
pub fn set_status(&self, status: ComputeStatus) {
|
||||
self.state.write().unwrap().status = status;
|
||||
}
|
||||
|
||||
pub fn get_status(&self) -> ComputeStatus {
|
||||
self.state.read().unwrap().status
|
||||
}
|
||||
|
||||
// Remove `pgdata` directory and create it again with right permissions.
|
||||
fn create_pgdata(&self) -> Result<()> {
|
||||
// Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
|
||||
// If it is something different then create_dir() will error out anyway.
|
||||
let _ok = fs::remove_dir_all(&self.pgdata);
|
||||
fs::create_dir(&self.pgdata)?;
|
||||
fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
fn get_basebackup(&self, lsn: &str) -> Result<()> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
|
||||
self.metrics.basebackup_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
// and return the reported LSN back to the caller.
|
||||
fn sync_safekeepers(&self) -> Result<String> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let sync_handle = Command::new(&self.pgbin)
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
sync_output.status,
|
||||
);
|
||||
}
|
||||
|
||||
self.metrics.sync_safekeepers_ms.store(
|
||||
Utc::now()
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
|
||||
/// Do all the preparations like PGDATA directory creation, configuration,
|
||||
/// safekeepers sync, basebackup, etc.
|
||||
pub fn prepare_pgdata(&self) -> Result<()> {
|
||||
let spec = &self.spec;
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Remove/create an empty pgdata directory and put configuration there.
|
||||
self.create_pgdata()?;
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
|
||||
info!("starting safekeepers syncing");
|
||||
let lsn = self
|
||||
.sync_safekeepers()
|
||||
.with_context(|| "failed to sync safekeepers")?;
|
||||
info!("safekeepers synced at LSN {}", lsn);
|
||||
|
||||
info!(
|
||||
"getting basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
);
|
||||
self.get_basebackup(&lsn).with_context(|| {
|
||||
format!(
|
||||
"failed to get basebackup@{} from pageserver {}",
|
||||
lsn, &self.pageserver_connstr
|
||||
)
|
||||
})?;
|
||||
|
||||
// Update pg_hba.conf received with basebackup.
|
||||
update_pg_hba(pgdata_path)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start Postgres as a child process and manage DBs/roles.
|
||||
/// After that this will hang waiting on the postmaster process to exit.
|
||||
pub fn run(&self) -> Result<ExitStatus> {
|
||||
let start_time = Utc::now();
|
||||
|
||||
let pgdata_path = Path::new(&self.pgdata);
|
||||
|
||||
// Run postgres as a child process.
|
||||
let mut pg = Command::new(&self.pgbin)
|
||||
.args(&["-D", &self.pgdata])
|
||||
.spawn()
|
||||
.expect("cannot start postgres process");
|
||||
|
||||
// Try default Postgres port if it is not provided
|
||||
let port = self
|
||||
.spec
|
||||
.cluster
|
||||
.settings
|
||||
.find("port")
|
||||
.unwrap_or_else(|| "5432".to_string());
|
||||
wait_for_postgres(&mut pg, &port, pgdata_path)?;
|
||||
|
||||
let mut client = Client::connect(&self.connstr, NoTls)?;
|
||||
|
||||
handle_roles(&self.spec, &mut client)?;
|
||||
handle_databases(&self.spec, &mut client)?;
|
||||
handle_grants(&self.spec, &mut client)?;
|
||||
create_writablity_check_data(&mut client)?;
|
||||
|
||||
// 'Close' connection
|
||||
drop(client);
|
||||
let startup_end_time = Utc::now();
|
||||
|
||||
self.metrics.config_ms.store(
|
||||
startup_end_time
|
||||
.signed_duration_since(start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
self.metrics.total_startup_ms.store(
|
||||
startup_end_time
|
||||
.signed_duration_since(self.start_time)
|
||||
.to_std()
|
||||
.unwrap()
|
||||
.as_millis() as u64,
|
||||
Ordering::Relaxed,
|
||||
);
|
||||
|
||||
self.set_status(ComputeStatus::Running);
|
||||
|
||||
info!(
|
||||
"finished configuration of compute for project {}",
|
||||
self.spec.cluster.cluster_id
|
||||
);
|
||||
|
||||
// Wait for child Postgres process basically forever. In this state Ctrl+C
|
||||
// will propagate to Postgres and it will be shut down as well.
|
||||
let ecode = pg
|
||||
.wait()
|
||||
.expect("failed to start waiting on Postgres process");
|
||||
|
||||
Ok(ecode)
|
||||
}
|
||||
|
||||
pub fn prepare_and_run(&self) -> Result<ExitStatus> {
|
||||
info!(
|
||||
"starting compute for project {}, operation {}, tenant {}, timeline {}",
|
||||
self.spec.cluster.cluster_id,
|
||||
self.spec.operation_uuid.as_ref().unwrap(),
|
||||
self.tenant,
|
||||
self.timeline,
|
||||
);
|
||||
|
||||
self.prepare_pgdata()?;
|
||||
self.run()
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,7 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::pg_helpers::PgOptionsSerialize;
|
||||
use crate::spec::ComputeSpec;
|
||||
use crate::zenith::ClusterSpec;
|
||||
|
||||
/// Check that `line` is inside a text file and put it there if it is not.
|
||||
/// Create file if it doesn't exist.
|
||||
@@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
|
||||
}
|
||||
|
||||
/// Create or completely rewrite configuration file specified by `path`
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
|
||||
pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> {
|
||||
// File::create() destroys the file content if it exists.
|
||||
let mut postgres_conf = File::create(path)?;
|
||||
|
||||
write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
|
||||
write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Write Postgres config block wrapped with generated comment section
|
||||
fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
|
||||
writeln!(file, "# Managed by compute_ctl: begin")?;
|
||||
fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> {
|
||||
writeln!(file, "# Managed by Zenith: begin")?;
|
||||
writeln!(file, "{}", buf)?;
|
||||
writeln!(file, "# Managed by compute_ctl: end")?;
|
||||
writeln!(file, "# Managed by Zenith: end")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use log::{error, info};
|
||||
use serde_json;
|
||||
|
||||
use crate::compute::{ComputeNode, ComputeStatus};
|
||||
|
||||
// Service function to handle all available routes.
|
||||
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in the plain text.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/last_activity") => {
|
||||
info!("serving /last_active GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
|
||||
// Use RFC3339 format for consistency.
|
||||
Response::new(Body::from(state.last_active.to_rfc3339()))
|
||||
}
|
||||
|
||||
// Has compute setup process finished? -> true/false.
|
||||
// DEPRECATED in favour of /status
|
||||
(&Method::GET, "/ready") => {
|
||||
info!("serving /ready GET request");
|
||||
let status = compute.get_status();
|
||||
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
|
||||
}
|
||||
|
||||
// Serialized compute state.
|
||||
(&Method::GET, "/status") => {
|
||||
info!("serving /status GET request");
|
||||
let state = compute.state.read().unwrap();
|
||||
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
|
||||
}
|
||||
|
||||
// Startup metrics in JSON format. Keep /metrics reserved for a possible
|
||||
// future use for Prometheus metrics format.
|
||||
(&Method::GET, "/metrics.json") => {
|
||||
info!("serving /metrics.json GET request");
|
||||
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
|
||||
}
|
||||
|
||||
// DEPRECATED, use POST instead
|
||||
(&Method::GET, "/check_writability") => {
|
||||
info!("serving /check_writability GET request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
(&Method::POST, "/check_writability") => {
|
||||
info!("serving /check_writability POST request");
|
||||
let res = crate::checker::check_writability(&compute).await;
|
||||
match res {
|
||||
Ok(_) => Response::new(Body::from("true")),
|
||||
Err(e) => Response::new(Body::from(e.to_string())),
|
||||
}
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<ComputeNode>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move { Ok::<_, Infallible>(routes(req, state).await) }
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
info!("starting HTTP server on {}", addr);
|
||||
|
||||
let server = Server::bind(&addr).serve(make_service);
|
||||
|
||||
// Run this server forever
|
||||
if let Err(e) = server.await {
|
||||
error!("server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-endpoint".into())
|
||||
.spawn(move || serve(state))?)
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
pub mod api;
|
||||
@@ -1,158 +0,0 @@
|
||||
openapi: "3.0.2"
|
||||
info:
|
||||
title: Compute node control API
|
||||
version: "1.0"
|
||||
|
||||
servers:
|
||||
- url: "http://localhost:3080"
|
||||
|
||||
paths:
|
||||
/status:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get compute node internal status
|
||||
description: ""
|
||||
operationId: getComputeStatus
|
||||
responses:
|
||||
"200":
|
||||
description: ComputeState
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeState"
|
||||
|
||||
/metrics.json:
|
||||
get:
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get compute node startup metrics in JSON format
|
||||
description: ""
|
||||
operationId: getComputeMetricsJSON
|
||||
responses:
|
||||
"200":
|
||||
description: ComputeMetrics
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ComputeMetrics"
|
||||
|
||||
/ready:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Check whether compute startup process finished successfully
|
||||
description: ""
|
||||
operationId: computeIsReady
|
||||
responses:
|
||||
"200":
|
||||
description: Compute is ready ('true') or not ('false')
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "true"
|
||||
|
||||
/last_activity:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "info"
|
||||
summary: Get timestamp of the last compute activity
|
||||
description: ""
|
||||
operationId: getLastComputeActivityTS
|
||||
responses:
|
||||
"200":
|
||||
description: Timestamp of the last compute activity
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
|
||||
/check_writability:
|
||||
get:
|
||||
deprecated: true
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritabilityDeprecated
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
post:
|
||||
tags:
|
||||
- "check"
|
||||
summary: Check that we can write new data on this compute
|
||||
description: ""
|
||||
operationId: checkComputeWritability
|
||||
responses:
|
||||
"200":
|
||||
description: Check result
|
||||
content:
|
||||
text/plain:
|
||||
schema:
|
||||
type: string
|
||||
description: Error text or 'true' if check passed
|
||||
example: "true"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
type: http
|
||||
scheme: bearer
|
||||
bearerFormat: JWT
|
||||
|
||||
schemas:
|
||||
ComputeMetrics:
|
||||
type: object
|
||||
description: Compute startup metrics
|
||||
required:
|
||||
- sync_safekeepers_ms
|
||||
- basebackup_ms
|
||||
- config_ms
|
||||
- total_startup_ms
|
||||
properties:
|
||||
sync_safekeepers_ms:
|
||||
type: integer
|
||||
basebackup_ms:
|
||||
type: integer
|
||||
config_ms:
|
||||
type: integer
|
||||
total_startup_ms:
|
||||
type: integer
|
||||
|
||||
ComputeState:
|
||||
type: object
|
||||
required:
|
||||
- status
|
||||
- last_active
|
||||
properties:
|
||||
status:
|
||||
$ref: '#/components/schemas/ComputeStatus'
|
||||
last_active:
|
||||
type: string
|
||||
description: The last detected compute activity timestamp in UTC and RFC3339 format
|
||||
example: "2022-10-12T07:20:50.52Z"
|
||||
error:
|
||||
type: string
|
||||
description: Text of the error during compute startup, if any
|
||||
|
||||
ComputeStatus:
|
||||
type: string
|
||||
enum:
|
||||
- init
|
||||
- failed
|
||||
- running
|
||||
|
||||
security:
|
||||
- JWT: []
|
||||
73
compute_tools/src/http_api.rs
Normal file
73
compute_tools/src/http_api.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::convert::Infallible;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
|
||||
use anyhow::Result;
|
||||
use hyper::service::{make_service_fn, service_fn};
|
||||
use hyper::{Body, Method, Request, Response, Server, StatusCode};
|
||||
use log::{error, info};
|
||||
|
||||
use crate::zenith::*;
|
||||
|
||||
// Service function to handle all available routes.
|
||||
fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
|
||||
match (req.method(), req.uri().path()) {
|
||||
// Timestamp of the last Postgres activity in the plain text.
|
||||
(&Method::GET, "/last_activity") => {
|
||||
info!("serving /last_active GET request");
|
||||
let state = state.read().unwrap();
|
||||
|
||||
// Use RFC3339 format for consistency.
|
||||
Response::new(Body::from(state.last_active.to_rfc3339()))
|
||||
}
|
||||
|
||||
// Has compute setup process finished? -> true/false
|
||||
(&Method::GET, "/ready") => {
|
||||
info!("serving /ready GET request");
|
||||
let state = state.read().unwrap();
|
||||
Response::new(Body::from(format!("{}", state.ready)))
|
||||
}
|
||||
|
||||
// Return the `404 Not Found` for any other routes.
|
||||
_ => {
|
||||
let mut not_found = Response::new(Body::from("404 Not Found"));
|
||||
*not_found.status_mut() = StatusCode::NOT_FOUND;
|
||||
not_found
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
|
||||
#[tokio::main]
|
||||
async fn serve(state: Arc<RwLock<ComputeState>>) {
|
||||
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
|
||||
|
||||
let make_service = make_service_fn(move |_conn| {
|
||||
let state = state.clone();
|
||||
async move {
|
||||
Ok::<_, Infallible>(service_fn(move |req: Request<Body>| {
|
||||
let state = state.clone();
|
||||
async move { Ok::<_, Infallible>(routes(req, state)) }
|
||||
}))
|
||||
}
|
||||
});
|
||||
|
||||
info!("starting HTTP server on {}", addr);
|
||||
|
||||
let server = Server::bind(&addr).serve(make_service);
|
||||
|
||||
// Run this server forever
|
||||
if let Err(e) = server.await {
|
||||
error!("server error: {}", e);
|
||||
}
|
||||
}
|
||||
|
||||
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
|
||||
pub fn launch_http_server(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
.name("http-endpoint".into())
|
||||
.spawn(move || serve(state))?)
|
||||
}
|
||||
@@ -2,13 +2,12 @@
|
||||
//! Various tools and helpers to handle cluster / compute node (Postgres)
|
||||
//! configuration.
|
||||
//!
|
||||
pub mod checker;
|
||||
pub mod config;
|
||||
pub mod http;
|
||||
pub mod http_api;
|
||||
#[macro_use]
|
||||
pub mod logger;
|
||||
pub mod compute;
|
||||
pub mod monitor;
|
||||
pub mod params;
|
||||
pub mod pg_helpers;
|
||||
pub mod spec;
|
||||
pub mod zenith;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::{thread, time};
|
||||
|
||||
use anyhow::Result;
|
||||
@@ -6,16 +6,16 @@ use chrono::{DateTime, Utc};
|
||||
use log::{debug, info};
|
||||
use postgres::{Client, NoTls};
|
||||
|
||||
use crate::compute::ComputeNode;
|
||||
use crate::zenith::ComputeState;
|
||||
|
||||
const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
|
||||
|
||||
// Spin in a loop and figure out the last activity time in the Postgres.
|
||||
// Then update it in the shared state. This function never errors out.
|
||||
// XXX: the only expected panic is at `RwLock` unwrap().
|
||||
fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
|
||||
// Suppose that `connstr` doesn't change
|
||||
let connstr = compute.connstr.clone();
|
||||
let connstr = state.read().unwrap().connstr.clone();
|
||||
// Define `client` outside of the loop to reuse existing connection if it's active.
|
||||
let mut client = Client::connect(&connstr, NoTls);
|
||||
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
|
||||
@@ -46,7 +46,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors?
|
||||
&[],
|
||||
);
|
||||
let mut last_active = compute.state.read().unwrap().last_active;
|
||||
let mut last_active = state.read().unwrap().last_active;
|
||||
|
||||
if let Ok(backs) = backends {
|
||||
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
|
||||
@@ -83,14 +83,14 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
|
||||
// Update the last activity in the shared state if we got a more recent one.
|
||||
let mut state = compute.state.write().unwrap();
|
||||
let mut state = state.write().unwrap();
|
||||
if last_active > state.last_active {
|
||||
state.last_active = last_active;
|
||||
debug!("set the last compute activity time to: {}", last_active);
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!("cannot connect to postgres: {}, retrying", e);
|
||||
info!("cannot connect to postgres: {}, retrying", e);
|
||||
|
||||
// Establish a new connection and try again.
|
||||
client = Client::connect(&connstr, NoTls);
|
||||
@@ -100,7 +100,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
|
||||
}
|
||||
|
||||
/// Launch a separate compute monitor thread and return its `JoinHandle`.
|
||||
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
|
||||
pub fn launch_monitor(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
|
||||
let state = Arc::clone(state);
|
||||
|
||||
Ok(thread::Builder::new()
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
use std::net::{SocketAddr, TcpStream};
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
use std::path::Path;
|
||||
use std::process::Child;
|
||||
use std::process::Command;
|
||||
use std::str::FromStr;
|
||||
use std::{fs, thread, time};
|
||||
|
||||
@@ -134,14 +132,7 @@ impl Role {
|
||||
let mut params: String = "LOGIN".to_string();
|
||||
|
||||
if let Some(pass) = &self.encrypted_password {
|
||||
// Some time ago we supported only md5 and treated all encrypted_password as md5.
|
||||
// Now we also support SCRAM-SHA-256 and to preserve compatibility
|
||||
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
|
||||
if pass.starts_with("SCRAM-SHA-256") {
|
||||
params.push_str(&format!(" PASSWORD '{}'", pass));
|
||||
} else {
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
}
|
||||
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
|
||||
} else {
|
||||
params.push_str(" PASSWORD NULL");
|
||||
}
|
||||
@@ -222,12 +213,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
|
||||
/// Wait for Postgres to become ready to accept connections:
|
||||
/// - state should be `ready` in the `pgdata/postmaster.pid`
|
||||
/// - and we should be able to connect to 127.0.0.1:5432
|
||||
pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
|
||||
pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
|
||||
let pid_path = pgdata.join("postmaster.pid");
|
||||
let mut slept: u64 = 0; // ms
|
||||
let pause = time::Duration::from_millis(100);
|
||||
|
||||
let timeout = time::Duration::from_millis(10);
|
||||
let timeout = time::Duration::from_millis(200);
|
||||
let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
|
||||
|
||||
loop {
|
||||
@@ -238,19 +229,14 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
|
||||
bail!("timed out while waiting for Postgres to start");
|
||||
}
|
||||
|
||||
if let Ok(Some(status)) = pg.try_wait() {
|
||||
// Postgres exited, that is not what we expected, bail out earlier.
|
||||
let code = status.code().unwrap_or(-1);
|
||||
bail!("Postgres exited unexpectedly with code {}", code);
|
||||
}
|
||||
|
||||
if pid_path.exists() {
|
||||
let file = BufReader::new(File::open(&pid_path)?);
|
||||
let status = file
|
||||
.lines()
|
||||
.last()
|
||||
.unwrap()
|
||||
.unwrap_or_else(|_| "unknown".to_string());
|
||||
// XXX: dumb and the simplest way to get the last line in a text file
|
||||
// TODO: better use `.lines().last()` later
|
||||
let stdout = Command::new("tail")
|
||||
.args(&["-n1", pid_path.to_str().unwrap()])
|
||||
.output()?
|
||||
.stdout;
|
||||
let status = String::from_utf8(stdout)?;
|
||||
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
|
||||
|
||||
// Now Postgres is ready to accept connections
|
||||
|
||||
@@ -3,53 +3,16 @@ use std::path::Path;
|
||||
use anyhow::Result;
|
||||
use log::{info, log_enabled, warn, Level};
|
||||
use postgres::Client;
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::config;
|
||||
use crate::params::PG_HBA_ALL_MD5;
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ComputeSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
}
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
}
|
||||
use crate::zenith::ClusterSpec;
|
||||
|
||||
/// It takes cluster specification and does the following:
|
||||
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
|
||||
/// - Update `pg_hba.conf` to allow external connections.
|
||||
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
|
||||
pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> {
|
||||
// File `postgresql.conf` is no longer included into `basebackup`, so just
|
||||
// always write all config into it creating new file.
|
||||
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
|
||||
@@ -76,7 +39,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
|
||||
|
||||
/// Given a cluster spec json and open transaction it handles roles creation,
|
||||
/// deletion and update.
|
||||
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
let mut xact = client.transaction()?;
|
||||
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
|
||||
|
||||
@@ -173,20 +136,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
}
|
||||
} else {
|
||||
info!("role name: '{}'", &name);
|
||||
info!("role name {}", &name);
|
||||
let mut query: String = format!("CREATE ROLE {} ", name.quote());
|
||||
info!("role create query: '{}'", &query);
|
||||
info!("role create query {}", &query);
|
||||
info_print!(" -> create");
|
||||
|
||||
query.push_str(&role.to_pg_options());
|
||||
xact.execute(query.as_str(), &[])?;
|
||||
|
||||
let grant_query = format!(
|
||||
"grant pg_read_all_data, pg_write_all_data to {}",
|
||||
name.quote()
|
||||
);
|
||||
xact.execute(grant_query.as_str(), &[])?;
|
||||
info!("role grant query: '{}'", &grant_query);
|
||||
}
|
||||
|
||||
info_print!("\n");
|
||||
@@ -202,7 +158,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
|
||||
/// atomicity should be enough here due to the order of operations and various checks,
|
||||
/// which together provide us idempotency.
|
||||
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
|
||||
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
|
||||
|
||||
// Print a list of existing Postgres databases (only in debug mode)
|
||||
@@ -288,24 +244,3 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Grant CREATE ON DATABASE to the database owner
|
||||
// to allow clients create trusted extensions.
|
||||
pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
|
||||
info!("cluster spec grants:");
|
||||
|
||||
for db in &spec.cluster.databases {
|
||||
let dbname = &db.name;
|
||||
|
||||
let query: String = format!(
|
||||
"GRANT CREATE ON DATABASE {} TO {}",
|
||||
dbname.quote(),
|
||||
db.owner.quote()
|
||||
);
|
||||
info!("grant query {}", &query);
|
||||
|
||||
client.execute(query.as_str(), &[])?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
109
compute_tools/src/zenith.rs
Normal file
109
compute_tools/src/zenith.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use std::process::{Command, Stdio};
|
||||
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use postgres::{Client, NoTls};
|
||||
use serde::Deserialize;
|
||||
|
||||
use crate::pg_helpers::*;
|
||||
|
||||
/// Compute node state shared across several `zenith_ctl` threads.
|
||||
/// Should be used under `RwLock` to allow HTTP API server to serve
|
||||
/// status requests, while configuration is in progress.
|
||||
pub struct ComputeState {
|
||||
pub connstr: String,
|
||||
pub pgdata: String,
|
||||
pub pgbin: String,
|
||||
pub spec: ClusterSpec,
|
||||
/// Compute setup process has finished
|
||||
pub ready: bool,
|
||||
/// Timestamp of the last Postgres activity
|
||||
pub last_active: DateTime<Utc>,
|
||||
}
|
||||
|
||||
/// Cluster spec or configuration represented as an optional number of
|
||||
/// delta operations + final cluster state description.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct ClusterSpec {
|
||||
pub format_version: f32,
|
||||
pub timestamp: String,
|
||||
pub operation_uuid: Option<String>,
|
||||
/// Expected cluster state at the end of transition process.
|
||||
pub cluster: Cluster,
|
||||
pub delta_operations: Option<Vec<DeltaOp>>,
|
||||
}
|
||||
|
||||
/// Cluster state seen from the perspective of the external tools
|
||||
/// like Rails web console.
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct Cluster {
|
||||
pub cluster_id: String,
|
||||
pub name: String,
|
||||
pub state: Option<String>,
|
||||
pub roles: Vec<Role>,
|
||||
pub databases: Vec<Database>,
|
||||
pub settings: GenericOptions,
|
||||
}
|
||||
|
||||
/// Single cluster state changing operation that could not be represented as
|
||||
/// a static `Cluster` structure. For example:
|
||||
/// - DROP DATABASE
|
||||
/// - DROP ROLE
|
||||
/// - ALTER ROLE name RENAME TO new_name
|
||||
/// - ALTER DATABASE name RENAME TO new_name
|
||||
#[derive(Clone, Deserialize)]
|
||||
pub struct DeltaOp {
|
||||
pub action: String,
|
||||
pub name: PgIdent,
|
||||
pub new_name: Option<PgIdent>,
|
||||
}
|
||||
|
||||
/// Get basebackup from the libpq connection to pageserver using `connstr` and
|
||||
/// unarchive it to `pgdata` directory overriding all its previous content.
|
||||
pub fn get_basebackup(
|
||||
pgdata: &str,
|
||||
connstr: &str,
|
||||
tenant: &str,
|
||||
timeline: &str,
|
||||
lsn: &str,
|
||||
) -> Result<()> {
|
||||
let mut client = Client::connect(connstr, NoTls)?;
|
||||
let basebackup_cmd = match lsn {
|
||||
"0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute
|
||||
_ => format!("basebackup {} {} {}", tenant, timeline, lsn),
|
||||
};
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
|
||||
ar.unpack(&pgdata)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Run `postgres` in a special mode with `--sync-safekeepers` argument
|
||||
/// and return the reported LSN back to the caller.
|
||||
pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
|
||||
let sync_handle = Command::new(&pgbin)
|
||||
.args(&["--sync-safekeepers"])
|
||||
.env("PGDATA", &pgdata) // we cannot use -D in this mode
|
||||
.stdout(Stdio::piped())
|
||||
.spawn()
|
||||
.expect("postgres --sync-safekeepers failed to start");
|
||||
|
||||
// `postgres --sync-safekeepers` will print all log output to stderr and
|
||||
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
|
||||
// redirected to the caller output.
|
||||
let sync_output = sync_handle
|
||||
.wait_with_output()
|
||||
.expect("postgres --sync-safekeepers failed");
|
||||
if !sync_output.status.success() {
|
||||
anyhow::bail!(
|
||||
"postgres --sync-safekeepers exited with non-zero status: {}",
|
||||
sync_output.status,
|
||||
);
|
||||
}
|
||||
|
||||
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
|
||||
|
||||
Ok(lsn)
|
||||
}
|
||||
@@ -4,12 +4,12 @@ mod pg_helpers_tests {
|
||||
use std::fs::File;
|
||||
|
||||
use compute_tools::pg_helpers::*;
|
||||
use compute_tools::spec::ComputeSpec;
|
||||
use compute_tools::zenith::ClusterSpec;
|
||||
|
||||
#[test]
|
||||
fn params_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.databases.first().unwrap().to_pg_options(),
|
||||
@@ -24,7 +24,7 @@ mod pg_helpers_tests {
|
||||
#[test]
|
||||
fn settings_serialize() {
|
||||
let file = File::open("tests/cluster_spec.json").unwrap();
|
||||
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
|
||||
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
spec.cluster.settings.as_pg_settings(),
|
||||
|
||||
@@ -4,8 +4,8 @@ version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
tar = "0.4.38"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tar = "0.4.33"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_with = "1.12.0"
|
||||
toml = "0.5"
|
||||
@@ -18,6 +18,6 @@ url = "2.2.2"
|
||||
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
|
||||
|
||||
pageserver = { path = "../pageserver" }
|
||||
safekeeper = { path = "../safekeeper" }
|
||||
utils = { path = "../libs/utils" }
|
||||
walkeeper = { path = "../walkeeper" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
@@ -9,6 +9,3 @@ auth_type = 'Trust'
|
||||
id = 1
|
||||
pg_port = 5454
|
||||
http_port = 7676
|
||||
|
||||
[etcd_broker]
|
||||
broker_endpoints = ['http://127.0.0.1:2379']
|
||||
|
||||
@@ -11,12 +11,11 @@ use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use utils::{
|
||||
connstring::connection_host_port,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::connstring::connection_host_port;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::postgresql_conf::PostgresConf;
|
||||
@@ -231,13 +230,8 @@ impl PostgresNode {
|
||||
.context("page server 'basebackup' command failed")?;
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(copyreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata())
|
||||
tar::Archive::new(copyreader)
|
||||
.unpack(&self.pgdata())
|
||||
.context("extracting base backup failed")?;
|
||||
|
||||
Ok(())
|
||||
@@ -278,9 +272,12 @@ impl PostgresNode {
|
||||
conf.append("wal_sender_timeout", "5s");
|
||||
conf.append("listen_addresses", &self.address.ip().to_string());
|
||||
conf.append("port", &self.address.port().to_string());
|
||||
conf.append("wal_keep_size", "0");
|
||||
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
|
||||
conf.append("restart_after_crash", "off");
|
||||
|
||||
// Never clean up old WAL. TODO: We should use a replication
|
||||
// slot or something proper, to prevent the compute node
|
||||
// from removing WAL that hasn't been streamed to the safekeeper or
|
||||
// page server yet. (gh issue #349)
|
||||
conf.append("wal_keep_size", "10TB");
|
||||
|
||||
// Configure the node to fetch pages from pageserver
|
||||
let pageserver_connstr = {
|
||||
@@ -334,14 +331,14 @@ impl PostgresNode {
|
||||
// Configure the node to connect to the safekeepers
|
||||
conf.append("synchronous_standby_names", "walproposer");
|
||||
|
||||
let safekeepers = self
|
||||
let wal_acceptors = self
|
||||
.env
|
||||
.safekeepers
|
||||
.iter()
|
||||
.map(|sk| format!("localhost:{}", sk.pg_port))
|
||||
.collect::<Vec<String>>()
|
||||
.join(",");
|
||||
conf.append("wal_acceptors", &safekeepers);
|
||||
conf.append("wal_acceptors", &wal_acceptors);
|
||||
} else {
|
||||
// We only use setup without safekeepers for tests,
|
||||
// and don't care about data durability on pageserver,
|
||||
@@ -423,15 +420,10 @@ impl PostgresNode {
|
||||
if let Some(token) = auth_token {
|
||||
cmd.env("ZENITH_AUTH_TOKEN", token);
|
||||
}
|
||||
let pg_ctl = cmd.status().context("pg_ctl failed")?;
|
||||
|
||||
let pg_ctl = cmd.output().context("pg_ctl failed")?;
|
||||
if !pg_ctl.status.success() {
|
||||
anyhow::bail!(
|
||||
"pg_ctl failed, exit code: {}, stdout: {}, stderr: {}",
|
||||
pg_ctl.status,
|
||||
String::from_utf8_lossy(&pg_ctl.stdout),
|
||||
String::from_utf8_lossy(&pg_ctl.stderr),
|
||||
);
|
||||
if !pg_ctl.success() {
|
||||
anyhow::bail!("pg_ctl failed");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
use std::{
|
||||
fs,
|
||||
path::PathBuf,
|
||||
process::{Command, Stdio},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use nix::{
|
||||
sys::signal::{kill, Signal},
|
||||
unistd::Pid,
|
||||
};
|
||||
|
||||
use crate::{local_env, read_pidfile};
|
||||
|
||||
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_broker = &env.etcd_broker;
|
||||
println!(
|
||||
"Starting etcd broker using {}",
|
||||
etcd_broker.etcd_binary_path.display()
|
||||
);
|
||||
|
||||
let etcd_data_dir = env.base_data_dir.join("etcd");
|
||||
fs::create_dir_all(&etcd_data_dir).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd data dir: {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
let etcd_stdout_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create ectd stout file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let etcd_stderr_file =
|
||||
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
|
||||
format!(
|
||||
"Failed to create ectd stderr file in directory {}",
|
||||
etcd_data_dir.display()
|
||||
)
|
||||
})?;
|
||||
let client_urls = etcd_broker.comma_separated_endpoints();
|
||||
|
||||
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
|
||||
.args(&[
|
||||
format!("--data-dir={}", etcd_data_dir.display()),
|
||||
format!("--listen-client-urls={client_urls}"),
|
||||
format!("--advertise-client-urls={client_urls}"),
|
||||
// Set --quota-backend-bytes to keep the etcd virtual memory
|
||||
// size smaller. Our test etcd clusters are very small.
|
||||
// See https://github.com/etcd-io/etcd/issues/7910
|
||||
"--quota-backend-bytes=100000000".to_string(),
|
||||
])
|
||||
.stdout(Stdio::from(etcd_stdout_file))
|
||||
.stderr(Stdio::from(etcd_stderr_file))
|
||||
.spawn()
|
||||
.context("Failed to spawn etcd subprocess")?;
|
||||
let pid = etcd_process.id();
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
|
||||
format!(
|
||||
"Failed to create etcd pid file at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let etcd_path = &env.etcd_broker.etcd_binary_path;
|
||||
println!("Stopping etcd broker at {}", etcd_path.display());
|
||||
|
||||
let etcd_pid_file_path = etcd_pid_file_path(env);
|
||||
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
|
||||
format!(
|
||||
"Failed to read etcd pid filea at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?);
|
||||
|
||||
kill(pid, Signal::SIGTERM).with_context(|| {
|
||||
format!(
|
||||
"Failed to stop etcd with pid {pid} at {}",
|
||||
etcd_pid_file_path.display()
|
||||
)
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
|
||||
env.base_data_dir.join("etcd.pid")
|
||||
}
|
||||
@@ -12,7 +12,6 @@ use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
pub mod compute;
|
||||
pub mod etcd;
|
||||
pub mod local_env;
|
||||
pub mod postgresql_conf;
|
||||
pub mod safekeeper;
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
//! script which will use local paths.
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use std::collections::HashMap;
|
||||
@@ -12,11 +11,9 @@ use std::env;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use utils::{
|
||||
auth::{encode_from_key_file, Claims, Scope},
|
||||
postgres_backend::AuthType,
|
||||
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::auth::{encode_from_key_file, Claims, Scope};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use crate::safekeeper::SafekeeperNode;
|
||||
|
||||
@@ -60,7 +57,9 @@ pub struct LocalEnv {
|
||||
#[serde(default)]
|
||||
pub private_key_path: PathBuf,
|
||||
|
||||
pub etcd_broker: EtcdBroker,
|
||||
// A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
|
||||
#[serde(default)]
|
||||
pub broker_endpoints: Option<String>,
|
||||
|
||||
pub pageserver: PageServerConf,
|
||||
|
||||
@@ -76,67 +75,11 @@ pub struct LocalEnv {
|
||||
branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
|
||||
}
|
||||
|
||||
/// Etcd broker config for cluster internal communication.
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
pub struct EtcdBroker {
|
||||
/// A prefix to all to any key when pushing/polling etcd from a node.
|
||||
#[serde(default)]
|
||||
pub broker_etcd_prefix: Option<String>,
|
||||
|
||||
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Vec<DisplayFromStr>")]
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
|
||||
/// Etcd binary path to use.
|
||||
#[serde(default)]
|
||||
pub etcd_binary_path: PathBuf,
|
||||
}
|
||||
|
||||
impl EtcdBroker {
|
||||
pub fn locate_etcd() -> anyhow::Result<PathBuf> {
|
||||
let which_output = Command::new("which")
|
||||
.arg("etcd")
|
||||
.output()
|
||||
.context("Failed to run 'which etcd' command")?;
|
||||
let stdout = String::from_utf8_lossy(&which_output.stdout);
|
||||
ensure!(
|
||||
which_output.status.success(),
|
||||
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
|
||||
which_output.status,
|
||||
String::from_utf8_lossy(&which_output.stderr)
|
||||
);
|
||||
|
||||
let etcd_path = PathBuf::from(stdout.trim());
|
||||
ensure!(
|
||||
etcd_path.is_file(),
|
||||
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
|
||||
etcd_path.display()
|
||||
);
|
||||
|
||||
Ok(etcd_path)
|
||||
}
|
||||
|
||||
pub fn comma_separated_endpoints(&self) -> String {
|
||||
self.broker_endpoints.iter().map(Url::as_str).fold(
|
||||
String::new(),
|
||||
|mut comma_separated_urls, url| {
|
||||
if !comma_separated_urls.is_empty() {
|
||||
comma_separated_urls.push(',');
|
||||
}
|
||||
comma_separated_urls.push_str(url);
|
||||
comma_separated_urls
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct PageServerConf {
|
||||
// node id
|
||||
pub id: NodeId,
|
||||
pub id: ZNodeId,
|
||||
// Pageserver connection settings
|
||||
pub listen_pg_addr: String,
|
||||
pub listen_http_addr: String,
|
||||
@@ -151,7 +94,7 @@ pub struct PageServerConf {
|
||||
impl Default for PageServerConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: NodeId(0),
|
||||
id: ZNodeId(0),
|
||||
listen_pg_addr: String::new(),
|
||||
listen_http_addr: String::new(),
|
||||
auth_type: AuthType::Trust,
|
||||
@@ -163,7 +106,7 @@ impl Default for PageServerConf {
|
||||
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
|
||||
#[serde(default)]
|
||||
pub struct SafekeeperConf {
|
||||
pub id: NodeId,
|
||||
pub id: ZNodeId,
|
||||
pub pg_port: u16,
|
||||
pub http_port: u16,
|
||||
pub sync: bool,
|
||||
@@ -172,7 +115,7 @@ pub struct SafekeeperConf {
|
||||
impl Default for SafekeeperConf {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
id: NodeId(0),
|
||||
id: ZNodeId(0),
|
||||
pg_port: 0,
|
||||
http_port: 0,
|
||||
sync: true,
|
||||
@@ -235,7 +178,12 @@ impl LocalEnv {
|
||||
if old_timeline_id == &timeline_id {
|
||||
Ok(())
|
||||
} else {
|
||||
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
|
||||
bail!(
|
||||
"branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
|
||||
branch_name,
|
||||
old_timeline_id,
|
||||
timeline_id
|
||||
);
|
||||
}
|
||||
} else {
|
||||
existing_values.push((tenant_id, timeline_id));
|
||||
@@ -271,7 +219,7 @@ impl LocalEnv {
|
||||
///
|
||||
/// Unlike 'load_config', this function fills in any defaults that are missing
|
||||
/// from the config file.
|
||||
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
|
||||
pub fn create_config(toml: &str) -> anyhow::Result<Self> {
|
||||
let mut env: LocalEnv = toml::from_str(toml)?;
|
||||
|
||||
// Find postgres binaries.
|
||||
@@ -284,11 +232,26 @@ impl LocalEnv {
|
||||
env.pg_distrib_dir = cwd.join("tmp_install")
|
||||
}
|
||||
}
|
||||
if !env.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
env.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
// Find zenith binaries.
|
||||
if env.zenith_distrib_dir == Path::new("") {
|
||||
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !env.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
env.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// If no initial tenant ID was given, generate it.
|
||||
if env.default_tenant_id.is_none() {
|
||||
@@ -382,36 +345,6 @@ impl LocalEnv {
|
||||
"directory '{}' already exists. Perhaps already initialized?",
|
||||
base_path.display()
|
||||
);
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{}' in zenith distrib dir '{}'",
|
||||
binary,
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
for binary in ["pageserver", "safekeeper"] {
|
||||
if !self.zenith_distrib_dir.join(binary).exists() {
|
||||
bail!(
|
||||
"Can't find binary '{binary}' in zenith distrib dir '{}'",
|
||||
self.zenith_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
}
|
||||
if !self.pg_distrib_dir.join("bin/postgres").exists() {
|
||||
bail!(
|
||||
"Can't find postgres binary at {}",
|
||||
self.pg_distrib_dir.display()
|
||||
);
|
||||
}
|
||||
|
||||
fs::create_dir(&base_path)?;
|
||||
|
||||
@@ -469,35 +402,7 @@ impl LocalEnv {
|
||||
|
||||
fn base_path() -> PathBuf {
|
||||
match std::env::var_os("ZENITH_REPO_DIR") {
|
||||
Some(val) => PathBuf::from(val),
|
||||
None => PathBuf::from(".zenith"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_conf_parsing() {
|
||||
let simple_conf_toml = include_str!("../simple.conf");
|
||||
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
|
||||
assert!(
|
||||
simple_conf_parse_result.is_ok(),
|
||||
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
|
||||
);
|
||||
|
||||
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
|
||||
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
|
||||
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
|
||||
assert!(
|
||||
spoiled_url_toml.contains(spoiled_url_str),
|
||||
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
|
||||
);
|
||||
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
|
||||
assert!(
|
||||
spoiled_url_parse_result.is_err(),
|
||||
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
|
||||
);
|
||||
Some(val) => PathBuf::from(val.to_str().unwrap()),
|
||||
None => ".zenith".into(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,17 +13,15 @@ use nix::unistd::Pid;
|
||||
use postgres::Config;
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use safekeeper::http::models::TimelineCreateRequest;
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
use walkeeper::http::models::TimelineCreateRequest;
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::local_env::{LocalEnv, SafekeeperConf};
|
||||
use crate::storage::PageServerNode;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum SafekeeperHttpError {
|
||||
@@ -52,7 +50,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
Err(SafekeeperHttpError::Response(
|
||||
match self.json::<HttpErrorBody>() {
|
||||
Ok(err_body) => format!("Error: {}", err_body.msg),
|
||||
Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
|
||||
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
|
||||
},
|
||||
))
|
||||
}
|
||||
@@ -65,7 +63,7 @@ impl ResponseErrorMessageExt for Response {
|
||||
//
|
||||
#[derive(Debug)]
|
||||
pub struct SafekeeperNode {
|
||||
pub id: NodeId,
|
||||
pub id: ZNodeId,
|
||||
|
||||
pub conf: SafekeeperConf,
|
||||
|
||||
@@ -75,12 +73,16 @@ pub struct SafekeeperNode {
|
||||
pub http_base_url: String,
|
||||
|
||||
pub pageserver: Arc<PageServerNode>,
|
||||
|
||||
broker_endpoints: Option<String>,
|
||||
}
|
||||
|
||||
impl SafekeeperNode {
|
||||
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
|
||||
let pageserver = Arc::new(PageServerNode::from_env(env));
|
||||
|
||||
println!("initializing for sk {} for {}", conf.id, conf.http_port);
|
||||
|
||||
SafekeeperNode {
|
||||
id: conf.id,
|
||||
conf: conf.clone(),
|
||||
@@ -89,6 +91,7 @@ impl SafekeeperNode {
|
||||
http_client: Client::new(),
|
||||
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
|
||||
pageserver,
|
||||
broker_endpoints: env.broker_endpoints.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,7 +103,7 @@ impl SafekeeperNode {
|
||||
.unwrap()
|
||||
}
|
||||
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
|
||||
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
|
||||
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
|
||||
}
|
||||
|
||||
@@ -135,13 +138,8 @@ impl SafekeeperNode {
|
||||
if !self.conf.sync {
|
||||
cmd.arg("--no-sync");
|
||||
}
|
||||
|
||||
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
|
||||
if !comma_separated_endpoints.is_empty() {
|
||||
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
|
||||
}
|
||||
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
|
||||
cmd.args(&["--broker-etcd-prefix", prefix]);
|
||||
if let Some(ref ep) = self.broker_endpoints {
|
||||
cmd.args(&["--broker-endpoints", ep]);
|
||||
}
|
||||
|
||||
if !cmd.status()?.success() {
|
||||
@@ -205,13 +203,12 @@ impl SafekeeperNode {
|
||||
let pid = Pid::from_raw(pid);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping safekeeper {} immediately..", self.id);
|
||||
println!("Stop safekeeper immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping safekeeper {} gracefully..", self.id);
|
||||
println!("Stop safekeeper gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
@@ -233,35 +230,25 @@ impl SafekeeperNode {
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if safekeeper flushes a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for _ in 0..100 {
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
if err.kind() != io::ErrorKind::ConnectionRefused {
|
||||
eprintln!("\nSafekeeper connection failed with error: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
if tcp_stopped {
|
||||
// Also check status on the HTTP port
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Safekeeper stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
Ok(_) => {
|
||||
println!("Safekeeper status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!("\nSafekeeper status check failed with error: {err}");
|
||||
println!("Safekeeper status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => {
|
||||
// keep waiting
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Safekeeper still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop safekeeper with pid {}", pid);
|
||||
@@ -286,7 +273,7 @@ impl SafekeeperNode {
|
||||
&self,
|
||||
tenant_id: ZTenantId,
|
||||
timeline_id: ZTimelineId,
|
||||
peer_ids: Vec<NodeId>,
|
||||
peer_ids: Vec<ZNodeId>,
|
||||
) -> Result<()> {
|
||||
Ok(self
|
||||
.http_request(
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
use std::net::TcpStream;
|
||||
use std::path::PathBuf;
|
||||
@@ -10,23 +9,21 @@ use anyhow::{bail, Context};
|
||||
use nix::errno::Errno;
|
||||
use nix::sys::signal::{kill, Signal};
|
||||
use nix::unistd::Pid;
|
||||
use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::http::models::{TenantCreateRequest, TimelineCreateRequest};
|
||||
use pageserver::timelines::TimelineInfo;
|
||||
use postgres::{Config, NoTls};
|
||||
use reqwest::blocking::{Client, RequestBuilder, Response};
|
||||
use reqwest::{IntoUrl, Method};
|
||||
use thiserror::Error;
|
||||
use utils::{
|
||||
connstring::connection_address,
|
||||
http::error::HttpErrorBody,
|
||||
lsn::Lsn,
|
||||
postgres_backend::AuthType,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::http::error::HttpErrorBody;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZTenantId, ZTimelineId};
|
||||
|
||||
use crate::local_env::LocalEnv;
|
||||
use crate::{fill_rust_env_vars, read_pidfile};
|
||||
use pageserver::tenant_mgr::TenantInfo;
|
||||
use zenith_utils::connstring::connection_address;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum PageserverHttpError {
|
||||
@@ -121,16 +118,6 @@ impl PageServerNode {
|
||||
);
|
||||
let listen_pg_addr_param =
|
||||
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
|
||||
let broker_endpoints_param = format!(
|
||||
"broker_endpoints=[{}]",
|
||||
self.env
|
||||
.etcd_broker
|
||||
.broker_endpoints
|
||||
.iter()
|
||||
.map(|url| format!("'{url}'"))
|
||||
.collect::<Vec<_>>()
|
||||
.join(",")
|
||||
);
|
||||
let mut args = Vec::with_capacity(20);
|
||||
|
||||
args.push("--init");
|
||||
@@ -139,19 +126,8 @@ impl PageServerNode {
|
||||
args.extend(["-c", &authg_type_param]);
|
||||
args.extend(["-c", &listen_http_addr_param]);
|
||||
args.extend(["-c", &listen_pg_addr_param]);
|
||||
args.extend(["-c", &broker_endpoints_param]);
|
||||
args.extend(["-c", &id]);
|
||||
|
||||
let broker_etcd_prefix_param = self
|
||||
.env
|
||||
.etcd_broker
|
||||
.broker_etcd_prefix
|
||||
.as_ref()
|
||||
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
|
||||
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
|
||||
args.extend(["-c", broker_etcd_prefix_param]);
|
||||
}
|
||||
|
||||
for config_override in config_overrides {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
@@ -188,9 +164,6 @@ impl PageServerNode {
|
||||
);
|
||||
}
|
||||
|
||||
// echo the captured output of the init command
|
||||
println!("{}", String::from_utf8_lossy(&init_output.stdout));
|
||||
|
||||
Ok(initial_timeline_id)
|
||||
}
|
||||
|
||||
@@ -210,6 +183,8 @@ impl PageServerNode {
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
|
||||
let repo_path = self.repo_path();
|
||||
let mut args = vec!["-D", repo_path.to_str().unwrap()];
|
||||
|
||||
@@ -217,11 +192,9 @@ impl PageServerNode {
|
||||
args.extend(["-c", config_override]);
|
||||
}
|
||||
|
||||
let mut cmd = Command::new(self.env.pageserver_bin()?);
|
||||
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
filled_cmd = fill_aws_secrets_vars(filled_cmd);
|
||||
fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
|
||||
|
||||
if !filled_cmd.status()?.success() {
|
||||
if !cmd.status()?.success() {
|
||||
bail!(
|
||||
"Pageserver failed to start. See '{}' for details.",
|
||||
self.repo_path().join("pageserver.log").display()
|
||||
@@ -281,13 +254,12 @@ impl PageServerNode {
|
||||
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
|
||||
|
||||
let sig = if immediate {
|
||||
print!("Stopping pageserver immediately..");
|
||||
println!("Stop pageserver immediately");
|
||||
Signal::SIGQUIT
|
||||
} else {
|
||||
print!("Stopping pageserver gracefully..");
|
||||
println!("Stop pageserver gracefully");
|
||||
Signal::SIGTERM
|
||||
};
|
||||
io::stdout().flush().unwrap();
|
||||
match kill(pid, sig) {
|
||||
Ok(_) => (),
|
||||
Err(Errno::ESRCH) => {
|
||||
@@ -309,36 +281,25 @@ impl PageServerNode {
|
||||
// TODO Remove this "timeout" and handle it on caller side instead.
|
||||
// Shutting down may take a long time,
|
||||
// if pageserver checkpoints a lot of data
|
||||
let mut tcp_stopped = false;
|
||||
for _ in 0..100 {
|
||||
if !tcp_stopped {
|
||||
if let Err(err) = TcpStream::connect(&address) {
|
||||
tcp_stopped = true;
|
||||
if err.kind() != io::ErrorKind::ConnectionRefused {
|
||||
eprintln!("\nPageserver connection failed with error: {err}");
|
||||
}
|
||||
}
|
||||
}
|
||||
if tcp_stopped {
|
||||
// Also check status on the HTTP port
|
||||
if let Err(_e) = TcpStream::connect(&address) {
|
||||
println!("Pageserver stopped receiving connections");
|
||||
|
||||
//Now check status
|
||||
match self.check_status() {
|
||||
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
|
||||
println!("done!");
|
||||
return Ok(());
|
||||
Ok(_) => {
|
||||
println!("Pageserver status is OK. Wait a bit.");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
Err(err) => {
|
||||
eprintln!("\nPageserver status check failed with error: {err}");
|
||||
println!("Pageserver status is: {}", err);
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => {
|
||||
// keep waiting
|
||||
}
|
||||
}
|
||||
} else {
|
||||
println!("Pageserver still receives connections");
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
thread::sleep(Duration::from_secs(1));
|
||||
}
|
||||
|
||||
bail!("Failed to stop pageserver with pid {}", pid);
|
||||
@@ -381,36 +342,10 @@ impl PageServerNode {
|
||||
pub fn tenant_create(
|
||||
&self,
|
||||
new_tenant_id: Option<ZTenantId>,
|
||||
settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<Option<ZTenantId>> {
|
||||
let tenant_id_string = self
|
||||
.http_request(Method::POST, format!("{}/tenant", self.http_base_url))
|
||||
.json(&TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()?,
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()?,
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.json(&TenantCreateRequest { new_tenant_id })
|
||||
.send()?
|
||||
.error_from_body()?
|
||||
.json::<Option<String>>()?;
|
||||
@@ -427,35 +362,6 @@ impl PageServerNode {
|
||||
.transpose()
|
||||
}
|
||||
|
||||
pub fn tenant_config(&self, tenant_id: ZTenantId, settings: HashMap<&str, &str>) -> Result<()> {
|
||||
self.http_request(Method::PUT, format!("{}/tenant/config", self.http_base_url))
|
||||
.json(&TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: settings
|
||||
.get("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_target_size: settings
|
||||
.get("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.get("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
gc_horizon: settings
|
||||
.get("gc_horizon")
|
||||
.map(|x| x.parse::<u64>().unwrap()),
|
||||
gc_period: settings.get("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.get("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>().unwrap()),
|
||||
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
|
||||
})
|
||||
.send()?
|
||||
.error_from_body()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn timeline_list(&self, tenant_id: &ZTenantId) -> anyhow::Result<Vec<TimelineInfo>> {
|
||||
let timeline_infos: Vec<TimelineInfo> = self
|
||||
.http_request(
|
||||
@@ -493,12 +399,3 @@ impl PageServerNode {
|
||||
Ok(timeline_info_response)
|
||||
}
|
||||
}
|
||||
|
||||
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
|
||||
for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
|
||||
if let Ok(value) = std::env::var(env_key) {
|
||||
cmd = cmd.env(env_key, value);
|
||||
}
|
||||
}
|
||||
cmd
|
||||
}
|
||||
|
||||
@@ -1,20 +1,13 @@
|
||||
#!/bin/sh
|
||||
set -eux
|
||||
|
||||
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
|
||||
if [ "$broker_endpoints_param" != "absent" ]; then
|
||||
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
|
||||
else
|
||||
broker_endpoints_param=''
|
||||
fi
|
||||
|
||||
if [ "$1" = 'pageserver' ]; then
|
||||
if [ ! -d "/data/tenants" ]; then
|
||||
echo "Initializing pageserver data directory"
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
|
||||
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
|
||||
fi
|
||||
echo "Staring pageserver at 0.0.0.0:6400"
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
|
||||
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
|
||||
else
|
||||
"$@"
|
||||
fi
|
||||
|
||||
@@ -7,8 +7,8 @@
|
||||
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
|
||||
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
|
||||
- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
|
||||
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
|
||||
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
|
||||
- [pageserver/README](/pageserver/README) — pageserver overview.
|
||||
- [postgres_ffi/README](/postgres_ffi/README) — Postgres FFI overview.
|
||||
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
|
||||
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
|
||||
- [walkeeper/README](/walkeeper/README) — WAL service overview.
|
||||
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
|
||||
|
||||
@@ -27,4 +27,4 @@ management_token = jwt.encode({"scope": "pageserverapi"}, auth_keys.priv, algori
|
||||
tenant_token = jwt.encode({"scope": "tenant", "tenant_id": ps.initial_tenant}, auth_keys.priv, algorithm="RS256")
|
||||
```
|
||||
|
||||
Utility functions to work with jwts in rust are located in libs/utils/src/auth.rs
|
||||
Utility functions to work with jwts in rust are located in zenith_utils/src/auth.rs
|
||||
|
||||
@@ -1,20 +1,20 @@
|
||||
# Docker images of Neon
|
||||
# Docker images of Zenith
|
||||
|
||||
## Images
|
||||
|
||||
Currently we build two main images:
|
||||
|
||||
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
|
||||
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
|
||||
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
|
||||
|
||||
And additional intermediate image:
|
||||
And additional intermediate images:
|
||||
|
||||
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
|
||||
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
|
||||
|
||||
## Building pipeline
|
||||
|
||||
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
|
||||
1. Image `zenithdb/compute-tools` is re-built automatically.
|
||||
|
||||
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
|
||||
2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
|
||||
|
||||
2. `neondatabase/neon`
|
||||
3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.
|
||||
|
||||
@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
|
||||
|
||||
### Branch
|
||||
|
||||
We can create branch at certain LSN using `neon_local timeline branch` command.
|
||||
We can create branch at certain LSN using `zenith timeline branch` command.
|
||||
Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ Each Branch lives in a corresponding timeline[] and has an ancestor[].
|
||||
|
||||
NOTE: This is an overloaded term.
|
||||
|
||||
A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;
|
||||
A checkpoint record in the WAL marks a point in the WAL sequence at which it is guaranteed that all data files have been updated with all information from shared memory modified before that checkpoint;
|
||||
|
||||
### Checkpoint (Layered repository)
|
||||
|
||||
@@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline.
|
||||
|
||||
### Layered repository
|
||||
|
||||
Neon repository implementation that keeps data in layers.
|
||||
Zenith repository implementation that keeps data in layers.
|
||||
### LSN
|
||||
|
||||
The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
|
||||
@@ -101,23 +101,23 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a
|
||||
Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
|
||||
Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.
|
||||
|
||||
In Postgres and Neon LSNs are used to describe certain points in WAL handling.
|
||||
In postgres and Zenith lsns are used to describe certain points in WAL handling.
|
||||
|
||||
PostgreSQL LSNs and functions to monitor them:
|
||||
* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
|
||||
* `pg_current_wal_lsn()` - Returns the current write-ahead log write location.
|
||||
* `pg_current_wal_flush_lsn()` - Returns the current write-ahead log flush location.
|
||||
* `pg_last_wal_receive_lsn()` - Returns the last write-ahead log location that has been received and synced to disk by streaming replication. While streaming replication is in progress this will increase monotonically.
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
|
||||
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
|
||||
|
||||
Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
|
||||
Zenith safekeeper LSNs. For more check [walkeeper/README_PROTO.md](/walkeeper/README_PROTO.md)
|
||||
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
|
||||
* `RestartLSN`: position in WAL confirmed by all safekeepers.
|
||||
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
|
||||
* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records.
|
||||
|
||||
Neon pageserver LSNs:
|
||||
Zenith pageserver LSNs:
|
||||
* `last_record_lsn` - the end of last processed WAL record.
|
||||
* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
|
||||
* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
|
||||
@@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver.
|
||||
|
||||
### Pageserver
|
||||
|
||||
Neon storage engine: repositories + wal receiver + page service + wal redo.
|
||||
Zenith storage engine: repositories + wal receiver + page service + wal redo.
|
||||
|
||||
### Page service
|
||||
|
||||
@@ -184,13 +184,13 @@ relation exceeds that size, it is split into multiple segments.
|
||||
SLRUs include pg_clog, pg_multixact/members, and
|
||||
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
|
||||
they don't need to be stored permanently (e.g. pg_subtrans),
|
||||
or we do not support them in neon yet (pg_commit_ts).
|
||||
or we do not support them in zenith yet (pg_commit_ts).
|
||||
|
||||
### Tenant (Multitenancy)
|
||||
Tenant represents a single customer, interacting with Neon.
|
||||
Tenant represents a single customer, interacting with Zenith.
|
||||
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
|
||||
One pageserver[] can serve multiple tenants at once.
|
||||
One safekeeper
|
||||
One safekeeper
|
||||
|
||||
See `docs/multitenancy.md` for more.
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
|
||||
S3. WAL that has been archived to S3 can be removed from the
|
||||
safekeepers, so the safekeepers don't need a lot of disk space.
|
||||
|
||||
```
|
||||
|
||||
+----------------+
|
||||
+-----> | WAL safekeeper |
|
||||
| +----------------+
|
||||
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
|
||||
\
|
||||
\
|
||||
\
|
||||
\ +--------+
|
||||
\ | |
|
||||
+------> | S3 |
|
||||
| |
|
||||
+--------+
|
||||
\ +--------+
|
||||
\ | |
|
||||
+--> | S3 |
|
||||
| |
|
||||
+--------+
|
||||
|
||||
|
||||
```
|
||||
Every WAL safekeeper holds a section of WAL, and a VCL value.
|
||||
The WAL can be divided into three portions:
|
||||
|
||||
```
|
||||
|
||||
VCL LSN
|
||||
| |
|
||||
V V
|
||||
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
|
||||
Archived WAL Completed WAL In-flight WAL
|
||||
```
|
||||
|
||||
|
||||
Note that all this WAL kept in a safekeeper is a contiguous section.
|
||||
This is different from Aurora: In Aurora, there can be holes in the
|
||||
|
||||
@@ -12,7 +12,7 @@ Init empty pageserver using `initdb` in temporary directory.
|
||||
|
||||
`--storage_dest=FILE_PREFIX | S3_PREFIX |...` option defines object storage type, all other parameters are passed via env variables. Inspired by WAL-G style naming : https://wal-g.readthedocs.io/STORAGES/.
|
||||
|
||||
Save`storage_dest` and other parameters in config.
|
||||
Save`storage_dest` and other parameters in config.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
|
||||
```
|
||||
@@ -21,7 +21,7 @@ zenith start
|
||||
```
|
||||
|
||||
#### 2. Restart pageserver (manually or crash-recovery).
|
||||
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
|
||||
Take `storage_dest` from pageserver config, start pageserver from latest snapshot in `storage_dest`.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
|
||||
```
|
||||
@@ -32,7 +32,7 @@ zenith start
|
||||
Start pageserver from existing snapshot.
|
||||
Path to snapshot provided via `--snapshot_path=FILE_PREFIX | S3_PREFIX | ...`
|
||||
Do not save `snapshot_path` and `snapshot_format` in config, as it is a one-time operation.
|
||||
Save`storage_dest` parameters in config.
|
||||
Save`storage_dest` parameters in config.
|
||||
Push snapshots to `storage_dest` in background.
|
||||
```
|
||||
//I.e. we want to start zenith on top of existing $PGDATA and use s3 as a persistent storage.
|
||||
@@ -42,15 +42,15 @@ zenith start
|
||||
How to pass credentials needed for `snapshot_path`?
|
||||
|
||||
#### 4. Export.
|
||||
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
|
||||
Manually push snapshot to `snapshot_path` which differs from `storage_dest`
|
||||
Optionally set `snapshot_format`, which can be plain pgdata format or zenith format.
|
||||
```
|
||||
zenith export --snapshot_path=FILE_PREFIX --snapshot_format=pgdata
|
||||
```
|
||||
|
||||
#### Notes and questions
|
||||
- safekeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
|
||||
- walkeeper s3_offload should use same (similar) syntax for storage. How to set it in UI?
|
||||
- Why do we need `zenith init` as a separate command? Can't we init everything at first start?
|
||||
- We can think of better names for all options.
|
||||
- Export to plain postgres format will be useless, if we are not 100% compatible on page level.
|
||||
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
|
||||
I can recall at least one such difference - PD_WAL_LOGGED flag in pages.
|
||||
@@ -1,151 +0,0 @@
|
||||
# Dispatching a connection
|
||||
|
||||
For each client connection, Neon service needs to authenticate the
|
||||
connection, and route it to the right PostgreSQL instance.
|
||||
|
||||
## Authentication
|
||||
|
||||
There are three different ways to authenticate:
|
||||
|
||||
- anonymous; no authentication needed
|
||||
- PostgreSQL authentication
|
||||
- github single sign-on using browser
|
||||
|
||||
In anonymous access, the user doesn't need to perform any
|
||||
authentication at all. This can be used e.g. in interactive PostgreSQL
|
||||
documentation, allowing you to run the examples very quickly. Similar
|
||||
to sqlfiddle.com.
|
||||
|
||||
PostgreSQL authentication works the same as always. All the different
|
||||
PostgreSQL authentication options like SCRAM, kerberos, etc. are
|
||||
available. [1]
|
||||
|
||||
The third option is to authenticate with github single sign-on. When
|
||||
you open the connection in psql, you get a link that you open with
|
||||
your browser. Opening the link redirects you to github authentication,
|
||||
and lets the connection to proceed. This is also known as "Link auth" [2].
|
||||
|
||||
|
||||
## Routing the connection
|
||||
|
||||
When a client starts a connection, it needs to be routed to the
|
||||
correct PostgreSQL instance. Routing can be done by the proxy, acting
|
||||
as a man-in-the-middle, or the connection can be routed at the network
|
||||
level based on the hostname or IP address.
|
||||
|
||||
Either way, Neon needs to identify which PostgreSQL instance the
|
||||
connection should be routed to. If the instance is not already
|
||||
running, it needs to be started. Some connections always require a new
|
||||
PostgreSQL instance to be created, e.g. if you want to run a one-off
|
||||
query against a particular point-in-time.
|
||||
|
||||
The PostgreSQL instance is identified by:
|
||||
- Neon account (possibly anonymous)
|
||||
- cluster (known as tenant in the storage?)
|
||||
- branch or snapshot name
|
||||
- timestamp (PITR)
|
||||
- primary or read-replica
|
||||
- one-off read replica
|
||||
- one-off writeable branch
|
||||
|
||||
When you are using regular PostgreSQL authentication or anonymous
|
||||
access, the connection URL needs to contain all the information needed
|
||||
for the routing. With github single sign-on, the browser is involved
|
||||
and some details - the Neon account in particular - can be deduced
|
||||
from the authentication exchange.
|
||||
|
||||
There are three methods for identifying the PostgreSQL instance:
|
||||
|
||||
- Browser interaction (link auth)
|
||||
- Options in the connection URL and the domain name
|
||||
- A pre-defined endpoint, identified by domain name or IP address
|
||||
|
||||
### Link Auth
|
||||
|
||||
postgres://<username>@start.neon.tech/<dbname>
|
||||
|
||||
This gives you a link that you open in browser. Clicking the link
|
||||
performs github authentication, and the Neon account name is
|
||||
provided to the proxy behind the scenes. The proxy routes the
|
||||
connection to the primary PostgreSQL instance in cluster called
|
||||
"main", branch "main".
|
||||
|
||||
Further ideas:
|
||||
- You could pre-define a different target for link auth
|
||||
connections in the UI.
|
||||
- You could have a drop-down in the browser, allowing you to connect
|
||||
to any cluster you want. Link Auth can be like Teleport.
|
||||
|
||||
### Connection URL
|
||||
|
||||
The connection URL looks like this:
|
||||
|
||||
postgres://<username>@<cluster-id>.db.neon.tech/<dbname>
|
||||
|
||||
By default, this connects you to the primary PostgreSQL instance
|
||||
running on the "main" branch in the named cluster [3]. However, you can
|
||||
change that by specifying options in the connection URL. The following
|
||||
options are supported:
|
||||
|
||||
| option name | Description | Examples |
|
||||
| --- | --- | --- |
|
||||
| cluster | Cluster name | cluster:myproject |
|
||||
| branch | Branch name | branch:main |
|
||||
| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z |
|
||||
| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 |
|
||||
| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new |
|
||||
|
||||
For example, to read branch 'testing' as it was on Mar 31, 2022, you could
|
||||
specify a timestamp in the connection URL [4]:
|
||||
|
||||
postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31
|
||||
|
||||
Connecting with cluster name and options can be disabled in the UI. If
|
||||
disabled, you can only connect using a pre-defined endpoint.
|
||||
|
||||
### Pre-defined Endpoint
|
||||
|
||||
Instead of providing the cluster name, branch, and all those options
|
||||
in the connection URL, you can define a named endpoint with the same
|
||||
options.
|
||||
|
||||
In the UI, click "create endpoint". Fill in the details:
|
||||
|
||||
- Cluster name
|
||||
- Branch
|
||||
- timestamp or LSN
|
||||
- is this for the primary or for a read replica
|
||||
- etc.
|
||||
|
||||
When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect:
|
||||
|
||||
postgres://<username>@<endpoint-id>.endpoint.neon.tech/<dbname>
|
||||
|
||||
|
||||
An endpoint can be assigned a static or dynamic IP address, so that
|
||||
you can connect to it with clients that don't support TLS SNI. Maybe
|
||||
bypass the proxy altogether, but that ought to be invisible to the
|
||||
user.
|
||||
|
||||
You can limit the range of source IP addresses that are allowed to
|
||||
connect to an endpoint. An endpoint can also be exposed in an Amazon
|
||||
VPC, allowing direct connections from applications.
|
||||
|
||||
|
||||
# Footnotes
|
||||
|
||||
[1] I'm not sure how feasible it is to set up configure like Kerberos
|
||||
or LDAP in a cloud environment. But in principle I think we should
|
||||
allow customers to have the full power of PostgreSQL, including all
|
||||
authentication options. However, it's up to the customer to configure
|
||||
it correctly.
|
||||
|
||||
[2] Link is a way to both authenticate and to route the connection
|
||||
|
||||
[3] This assumes that cluster-ids are globally unique, across all
|
||||
Neon accounts.
|
||||
|
||||
[4] The syntax accepted in the connection URL is limited by libpq. The
|
||||
only way to pass arbitrary options to the server (or our proxy) is
|
||||
with the "options" keyword, and the options must be percent-encoded. I
|
||||
think the above would work but i haven't tested it
|
||||
@@ -1,79 +0,0 @@
|
||||
Cluster size limits
|
||||
==================
|
||||
|
||||
## Summary
|
||||
|
||||
One of the resource consumption limits for free-tier users is a cluster size limit.
|
||||
|
||||
To enforce it, we need to calculate the timeline size and check if the limit is reached before relation create/extend operations.
|
||||
If the limit is reached, the query must fail with some meaningful error/warning.
|
||||
We may want to exempt some operations from the quota to allow users free space to fit back into the limit.
|
||||
|
||||
The stateless compute node that performs validation is separate from the storage that calculates the usage, so we need to exchange cluster size information between those components.
|
||||
|
||||
## Motivation
|
||||
|
||||
Limit the maximum size of a PostgreSQL instance to limit free tier users (and other tiers in the future).
|
||||
First of all, this is needed to control our free tier production costs.
|
||||
Another reason to limit resources is risk management — we haven't (fully) tested and optimized zenith for big clusters,
|
||||
so we don't want to give users access to the functionality that we don't think is ready.
|
||||
|
||||
## Components
|
||||
|
||||
* pageserver - calculate the size consumed by a timeline and add it to the feedback message.
|
||||
* safekeeper - pass feedback message from pageserver to compute.
|
||||
* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`.
|
||||
* console - set and update `zenith.max_cluster_size` setting
|
||||
|
||||
## Proposed implementation
|
||||
|
||||
First of all, it's necessary to define timeline size.
|
||||
|
||||
The current approach is to count all data, including SLRUs. (not including WAL)
|
||||
Here we think of it as a physical disk underneath the Postgres cluster.
|
||||
This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
|
||||
|
||||
Alternatively, we could count only relation data. As in pg_database_size().
|
||||
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
|
||||
On the other hand, it puts us in a weaker position than other services, i.e., RDS.
|
||||
We will need to refactor the timeline_size counter or add another counter to implement it.
|
||||
|
||||
Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
|
||||
Then this size should be reported to compute node.
|
||||
|
||||
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
|
||||
|
||||
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
|
||||
|
||||
This message is received by the safekeeper and propagated to compute node as a part of `AppendResponse`.
|
||||
|
||||
Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable.
|
||||
|
||||
And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
|
||||
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html))
|
||||
|
||||
TODO:
|
||||
We can allow autovacuum processes to bypass this check, simply checking `IsAutoVacuumWorkerProcess()`.
|
||||
It would be nice to allow manual VACUUM and VACUUM FULL to bypass the check, but it's uneasy to distinguish these operations at the low level.
|
||||
See issues https://github.com/neondatabase/neon/issues/1245
|
||||
https://github.com/zenithdb/zenith/issues/1445
|
||||
|
||||
TODO:
|
||||
We should warn users if the limit is soon to be reached.
|
||||
|
||||
### **Reliability, failure modes and corner cases**
|
||||
|
||||
1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
|
||||
|
||||
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
|
||||
|
||||
So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
|
||||
|
||||
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
|
||||
|
||||
|
||||
### **Security implications**
|
||||
|
||||
We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM.
|
||||
Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check.
|
||||
To cover this case, we also monitor the compute node size in the console.
|
||||
@@ -6,6 +6,7 @@ If there's no such file during `init` phase of the server, it creates the file i
|
||||
There's a possibility to pass an arbitrary config value to the pageserver binary as an argument: such values override
|
||||
the values in the config file, if any are specified for the same key and get into the final config during init phase.
|
||||
|
||||
|
||||
### Config example
|
||||
|
||||
```toml
|
||||
@@ -25,22 +26,18 @@ max_file_descriptors = '100'
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
initial_superuser_name = 'zenith_admin'
|
||||
|
||||
broker_etcd_prefix = 'neon'
|
||||
broker_endpoints = ['some://etcd']
|
||||
|
||||
# [remote_storage]
|
||||
```
|
||||
|
||||
The config above shows default values for all basic pageserver settings, besides `broker_endpoints`: that one has to be set by the user,
|
||||
see the corresponding section below.
|
||||
The config above shows default values for all basic pageserver settings.
|
||||
Pageserver uses default values for all files that are missing in the config, so it's not a hard error to leave the config blank.
|
||||
Yet, it validates the config values it can (e.g. postgres install dir) and errors if the validation fails, refusing to start.
|
||||
|
||||
Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#table) in TOML specification and
|
||||
|
||||
- either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
|
||||
* either has to be placed in the config after the table-less values such as `initial_superuser_name = 'zenith_admin'`
|
||||
|
||||
- or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
|
||||
* or can be placed anywhere if rewritten in identical form as [inline table](https://toml.io/en/v1.0.0#inline-table): `remote_storage = {foo = 2}`
|
||||
|
||||
### Config values
|
||||
|
||||
@@ -50,17 +47,6 @@ Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage=
|
||||
|
||||
Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.
|
||||
|
||||
#### broker_endpoints
|
||||
|
||||
A list of endpoints (etcd currently) to connect and pull the information from.
|
||||
Mandatory, does not have a default, since requires etcd to be started as a separate process,
|
||||
and its connection url should be specified separately.
|
||||
|
||||
#### broker_etcd_prefix
|
||||
|
||||
A prefix to add for every etcd key used, to separate one group of related instances from another, in the same cluster.
|
||||
Default is `neon`.
|
||||
|
||||
#### checkpoint_distance
|
||||
|
||||
`checkpoint_distance` is the amount of incoming WAL that is held in
|
||||
@@ -71,7 +57,7 @@ but it will trigger a checkpoint operation to get it back below the
|
||||
limit.
|
||||
|
||||
`checkpoint_distance` also determines how much WAL needs to be kept
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
durable in the safekeeper. The safekeeper must have capacity to hold
|
||||
this much WAL, with some headroom, otherwise you can get stuck in a
|
||||
situation where the safekeeper is full and stops accepting new WAL,
|
||||
but the pageserver is not flushing out and releasing the space in the
|
||||
@@ -86,11 +72,7 @@ The unit is # of bytes.
|
||||
|
||||
Every `compaction_period` seconds, the page server checks if
|
||||
maintenance operations, like compaction, are needed on the layer
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### compaction_target_size
|
||||
|
||||
File sizes for L0 delta and L1 image layers. Default is 128MB.
|
||||
files. Default is 1 s, which should be fine.
|
||||
|
||||
#### gc_horizon
|
||||
|
||||
@@ -103,14 +85,6 @@ away.
|
||||
|
||||
Interval at which garbage collection is triggered. Default is 100 s.
|
||||
|
||||
#### image_creation_threshold
|
||||
|
||||
L0 delta layer threshold for L1 iamge layer creation. Default is 3.
|
||||
|
||||
#### pitr_interval
|
||||
|
||||
WAL retention duration for PITR branching. Default is 30 days.
|
||||
|
||||
#### initial_superuser_name
|
||||
|
||||
Name of the initial superuser role, passed to initdb when a new tenant
|
||||
@@ -177,11 +151,12 @@ bucket_region = 'eu-north-1'
|
||||
# Optional, pageserver uses entire bucket if the prefix is not specified.
|
||||
prefix_in_bucket = '/some/prefix/'
|
||||
|
||||
# S3 API query limit to avoid getting errors/throttling from AWS.
|
||||
concurrency_limit = 100
|
||||
```
|
||||
# Access key to connect to the bucket ("login" part of the credentials)
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
|
||||
If no IAM bucket access is used during the remote storage usage, use the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables to set the access credentials.
|
||||
# Secret access key to connect to the bucket ("password" part of the credentials)
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
###### General remote storage configuration
|
||||
|
||||
@@ -192,13 +167,14 @@ Besides, there are parameters common for all types of remote storage that can be
|
||||
|
||||
```toml
|
||||
[remote_storage]
|
||||
# Max number of concurrent timeline synchronized (layers uploaded or downloaded) with the remote storage at the same time.
|
||||
max_concurrent_syncs = 50
|
||||
# Max number of concurrent connections to open for uploading to or downloading from the remote storage.
|
||||
max_concurrent_sync = 100
|
||||
|
||||
# Max number of errors a single task can have before it's considered failed and not attempted to run anymore.
|
||||
max_sync_errors = 10
|
||||
```
|
||||
|
||||
|
||||
## safekeeper
|
||||
|
||||
TODO
|
||||
|
||||
@@ -28,7 +28,12 @@ The pageserver has a few different duties:
|
||||
- Receive WAL from the WAL service and decode it.
|
||||
- Replay WAL that's applicable to the chunks that the Page Server maintains
|
||||
|
||||
For more detailed info, see [/pageserver/README](/pageserver/README.md)
|
||||
For more detailed info, see `/pageserver/README`
|
||||
|
||||
`/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/proxy`:
|
||||
|
||||
@@ -52,12 +57,12 @@ PostgreSQL extension that implements storage manager API and network communicati
|
||||
|
||||
PostgreSQL extension that contains functions needed for testing and debugging.
|
||||
|
||||
`/safekeeper`:
|
||||
`/walkeeper`:
|
||||
|
||||
The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
|
||||
It acts as a holding area and redistribution center for recently generated WAL.
|
||||
|
||||
For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
|
||||
For more detailed info, see `/walkeeper/README`
|
||||
|
||||
`/workspace_hack`:
|
||||
The workspace_hack crate exists only to pin down some dependencies.
|
||||
@@ -69,21 +74,14 @@ We use [cargo-hakari](https://crates.io/crates/cargo-hakari) for automation.
|
||||
Main entry point for the 'zenith' CLI utility.
|
||||
TODO: Doesn't it belong to control_plane?
|
||||
|
||||
`/libs`:
|
||||
Unites granular neon helper crates under the hood.
|
||||
`/zenith_metrics`:
|
||||
|
||||
`/libs/postgres_ffi`:
|
||||
|
||||
Utility functions for interacting with PostgreSQL file formats.
|
||||
Misc constants, copied from PostgreSQL headers.
|
||||
|
||||
`/libs/utils`:
|
||||
Generic helpers that are shared between other crates in this repository.
|
||||
A subject for future modularization.
|
||||
|
||||
`/libs/metrics`:
|
||||
Helpers for exposing Prometheus metrics from the server.
|
||||
|
||||
`/zenith_utils`:
|
||||
|
||||
Helpers that are shared between other crates in this repository.
|
||||
|
||||
## Using Python
|
||||
Note that Debian/Ubuntu Python packages are stale, as it commonly happens,
|
||||
so manual installation of dependencies is not recommended.
|
||||
@@ -91,22 +89,18 @@ so manual installation of dependencies is not recommended.
|
||||
A single virtual environment with all dependencies is described in the single `Pipfile`.
|
||||
|
||||
### Prerequisites
|
||||
- Install Python 3.9 (the minimal supported version) or greater.
|
||||
- Install Python 3.7 (the minimal supported version) or greater.
|
||||
- Our setup with poetry should work with newer python versions too. So feel free to open an issue with a `c/test-runner` label if something doesnt work as expected.
|
||||
- If you have some trouble with other version you can resolve it by installing Python 3.9 separately, via [pyenv](https://github.com/pyenv/pyenv) or via system package manager e.g.:
|
||||
- If you have some trouble with other version you can resolve it by installing Python 3.7 separately, via pyenv or via system package manager e.g.:
|
||||
```bash
|
||||
# In Ubuntu
|
||||
sudo add-apt-repository ppa:deadsnakes/ppa
|
||||
sudo apt update
|
||||
sudo apt install python3.9
|
||||
sudo apt install python3.7
|
||||
```
|
||||
- Install `poetry`
|
||||
- Exact version of `poetry` is not important, see installation instructions available at poetry's [website](https://python-poetry.org/docs/#installation)`.
|
||||
- Install dependencies via `./scripts/pysync`.
|
||||
- Note that CI uses specific Python version (look for `PYTHON_VERSION` [here](https://github.com/neondatabase/docker-images/blob/main/rust/Dockerfile))
|
||||
so if you have different version some linting tools can yield different result locally vs in the CI.
|
||||
- You can explicitly specify which Python to use by running `poetry env use /path/to/python`, e.g. `poetry env use python3.9`.
|
||||
This may also disable the `The currently activated Python version X.Y.Z is not supported by the project` warning.
|
||||
- Install dependencies via `./scripts/pysync`. Note that CI uses Python 3.7 so if you have different version some linting tools can yield different result locally vs in the CI.
|
||||
|
||||
Run `poetry shell` to activate the virtual environment.
|
||||
Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
[package]
|
||||
name = "etcd_broker"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
etcd-client = "0.9.0"
|
||||
regex = "1.4.5"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
|
||||
utils = { path = "../utils" }
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
tokio = "1"
|
||||
tracing = "0.1"
|
||||
thiserror = "1"
|
||||
@@ -1,348 +0,0 @@
|
||||
//! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
|
||||
//! Intended to connect services to each other, not to store their data.
|
||||
use std::{
|
||||
collections::{hash_map, HashMap},
|
||||
fmt::Display,
|
||||
str::FromStr,
|
||||
};
|
||||
|
||||
use regex::{Captures, Regex};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
|
||||
pub use etcd_client::*;
|
||||
|
||||
use tokio::{sync::mpsc, task::JoinHandle};
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
lsn::Lsn,
|
||||
zid::{NodeId, ZTenantId, ZTenantTimelineId},
|
||||
};
|
||||
|
||||
/// Default value to use for prefixing to all etcd keys with.
|
||||
/// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
|
||||
pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";
|
||||
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
struct SafekeeperTimeline {
|
||||
safekeeper_id: NodeId,
|
||||
info: SkTimelineInfo,
|
||||
}
|
||||
|
||||
/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
|
||||
#[serde_as]
|
||||
#[derive(Debug, Deserialize, Serialize)]
|
||||
pub struct SkTimelineInfo {
|
||||
/// Term of the last entry.
|
||||
pub last_log_term: Option<u64>,
|
||||
/// LSN of the last record.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub flush_lsn: Option<Lsn>,
|
||||
/// Up to which LSN safekeeper regards its WAL as committed.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub commit_lsn: Option<Lsn>,
|
||||
/// LSN up to which safekeeper offloaded WAL to s3.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub s3_wal_lsn: Option<Lsn>,
|
||||
/// LSN of last checkpoint uploaded by pageserver.
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub remote_consistent_lsn: Option<Lsn>,
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
#[serde(default)]
|
||||
pub peer_horizon_lsn: Option<Lsn>,
|
||||
#[serde(default)]
|
||||
pub safekeeper_connection_string: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum BrokerError {
|
||||
#[error("Etcd client error: {0}. Context: {1}")]
|
||||
EtcdClient(etcd_client::Error, String),
|
||||
#[error("Error during parsing etcd data: {0}")]
|
||||
ParsingError(String),
|
||||
#[error("Internal error: {0}")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
/// A way to control the data retrieval from a certain subscription.
|
||||
pub struct SkTimelineSubscription {
|
||||
safekeeper_timeline_updates:
|
||||
mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
|
||||
kind: SkTimelineSubscriptionKind,
|
||||
watcher_handle: JoinHandle<Result<(), BrokerError>>,
|
||||
watcher: Watcher,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscription {
|
||||
/// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
|
||||
pub async fn fetch_data(
|
||||
&mut self,
|
||||
) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
|
||||
self.safekeeper_timeline_updates.recv().await
|
||||
}
|
||||
|
||||
/// Cancels the subscription, stopping the data poller and waiting for it to shut down.
|
||||
pub async fn cancel(mut self) -> Result<(), BrokerError> {
|
||||
self.watcher.cancel().await.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!(
|
||||
"Failed to cancel timeline subscription, kind: {:?}",
|
||||
self.kind
|
||||
),
|
||||
)
|
||||
})?;
|
||||
self.watcher_handle.await.map_err(|e| {
|
||||
BrokerError::InternalError(format!(
|
||||
"Failed to join the timeline updates task, kind: {:?}, error: {e}",
|
||||
self.kind
|
||||
))
|
||||
})?
|
||||
}
|
||||
}
|
||||
|
||||
/// The subscription kind to the timeline updates from safekeeper.
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct SkTimelineSubscriptionKind {
|
||||
broker_etcd_prefix: String,
|
||||
kind: SubscriptionKind,
|
||||
}
|
||||
|
||||
impl SkTimelineSubscriptionKind {
|
||||
pub fn all(broker_etcd_prefix: String) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::All,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::Tenant(tenant),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
|
||||
Self {
|
||||
broker_etcd_prefix,
|
||||
kind: SubscriptionKind::Timeline(timeline),
|
||||
}
|
||||
}
|
||||
|
||||
fn watch_regex(&self) -> Regex {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => Regex::new(&format!(
|
||||
r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'everything' subscription"),
|
||||
SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'tenant' subscription"),
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => Regex::new(&format!(
|
||||
r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$",
|
||||
self.broker_etcd_prefix
|
||||
))
|
||||
.expect("wrong regex for 'timeline' subscription"),
|
||||
}
|
||||
}
|
||||
|
||||
/// Etcd key to use for watching a certain timeline updates from safekeepers.
|
||||
pub fn watch_key(&self) -> String {
|
||||
match self.kind {
|
||||
SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
|
||||
SubscriptionKind::Tenant(tenant_id) => {
|
||||
format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
|
||||
}
|
||||
SubscriptionKind::Timeline(ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => format!(
|
||||
"{}/{tenant_id}/{timeline_id}/safekeeper",
|
||||
self.broker_etcd_prefix
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
enum SubscriptionKind {
|
||||
/// Get every timeline update.
|
||||
All,
|
||||
/// Get certain tenant timelines' updates.
|
||||
Tenant(ZTenantId),
|
||||
/// Get certain timeline updates.
|
||||
Timeline(ZTenantTimelineId),
|
||||
}
|
||||
|
||||
/// Creates a background task to poll etcd for timeline updates from safekeepers.
|
||||
/// Stops and returns `Err` on any error during etcd communication.
|
||||
/// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
|
||||
/// exiting normally in such cases.
|
||||
pub async fn subscribe_to_safekeeper_timeline_updates(
|
||||
client: &mut Client,
|
||||
subscription: SkTimelineSubscriptionKind,
|
||||
) -> Result<SkTimelineSubscription, BrokerError> {
|
||||
info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
|
||||
|
||||
let (watcher, mut stream) = client
|
||||
.watch(
|
||||
subscription.watch_key(),
|
||||
Some(WatchOptions::new().with_prefix()),
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
BrokerError::EtcdClient(
|
||||
e,
|
||||
format!("Failed to init the watch for subscription {subscription:?}"),
|
||||
)
|
||||
})?;
|
||||
|
||||
let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
|
||||
|
||||
let subscription_kind = subscription.kind;
|
||||
let regex = subscription.watch_regex();
|
||||
let watcher_handle = tokio::spawn(async move {
|
||||
while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
|
||||
"Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
|
||||
)))? {
|
||||
if resp.canceled() {
|
||||
info!("Watch for timeline updates subscription was canceled, exiting");
|
||||
break;
|
||||
}
|
||||
|
||||
let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
|
||||
// Keep track that the timeline data updates from etcd arrive in the right order.
|
||||
// https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
|
||||
// > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
|
||||
let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
|
||||
|
||||
|
||||
let events = resp.events();
|
||||
debug!("Processing {} events", events.len());
|
||||
|
||||
for event in events {
|
||||
if EventType::Put == event.event_type() {
|
||||
if let Some(new_etcd_kv) = event.kv() {
|
||||
let new_kv_version = new_etcd_kv.version();
|
||||
|
||||
match parse_etcd_key_value(subscription_kind, ®ex, new_etcd_kv) {
|
||||
Ok(Some((zttid, timeline))) => {
|
||||
match timeline_updates
|
||||
.entry(zttid)
|
||||
.or_default()
|
||||
.entry(timeline.safekeeper_id)
|
||||
{
|
||||
hash_map::Entry::Occupied(mut o) => {
|
||||
let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
|
||||
if old_etcd_kv_version < new_kv_version {
|
||||
o.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
hash_map::Entry::Vacant(v) => {
|
||||
v.insert(timeline.info);
|
||||
timeline_etcd_versions.insert(zttid,new_kv_version);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(None) => {}
|
||||
Err(e) => error!("Failed to parse timeline update: {e}"),
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Err(e) = timeline_updates_sender.send(timeline_updates) {
|
||||
info!("Timeline updates sender got dropped, exiting: {e}");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
});
|
||||
|
||||
Ok(SkTimelineSubscription {
|
||||
kind: subscription,
|
||||
safekeeper_timeline_updates,
|
||||
watcher_handle,
|
||||
watcher,
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_etcd_key_value(
|
||||
subscription_kind: SubscriptionKind,
|
||||
regex: &Regex,
|
||||
kv: &KeyValue,
|
||||
) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
|
||||
let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
|
||||
})?) {
|
||||
caps
|
||||
} else {
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let (zttid, safekeeper_id) = match subscription_kind {
|
||||
SubscriptionKind::All => (
|
||||
ZTenantTimelineId::new(
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Tenant(tenant_id) => (
|
||||
ZTenantTimelineId::new(
|
||||
tenant_id,
|
||||
parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
|
||||
),
|
||||
NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
SubscriptionKind::Timeline(zttid) => (
|
||||
zttid,
|
||||
NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
|
||||
),
|
||||
};
|
||||
|
||||
let info_str = kv.value_str().map_err(|e| {
|
||||
BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
|
||||
})?;
|
||||
Ok(Some((
|
||||
zttid,
|
||||
SafekeeperTimeline {
|
||||
safekeeper_id,
|
||||
info: serde_json::from_str(info_str).map_err(|e| {
|
||||
BrokerError::ParsingError(format!(
|
||||
"Failed to parse '{info_str}' as safekeeper timeline info: {e}"
|
||||
))
|
||||
})?,
|
||||
},
|
||||
)))
|
||||
}
|
||||
|
||||
fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: Display,
|
||||
{
|
||||
let capture_match = caps
|
||||
.get(index)
|
||||
.ok_or_else(|| format!("Failed to get capture match at index {index}"))?
|
||||
.as_str();
|
||||
capture_match.parse().map_err(|e| {
|
||||
format!(
|
||||
"Failed to parse {} from {capture_match}: {e}",
|
||||
std::any::type_name::<T>()
|
||||
)
|
||||
})
|
||||
}
|
||||
@@ -1,11 +0,0 @@
|
||||
[package]
|
||||
name = "metrics"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
|
||||
libc = "0.2"
|
||||
lazy_static = "1.4"
|
||||
once_cell = "1.8.0"
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
@@ -1,14 +0,0 @@
|
||||
[package]
|
||||
name = "wal_generate"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0"
|
||||
clap = "3.0"
|
||||
env_logger = "0.9"
|
||||
log = "0.4"
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tempfile = "3.2"
|
||||
@@ -1,58 +0,0 @@
|
||||
use anyhow::*;
|
||||
use clap::{App, Arg};
|
||||
use wal_generate::*;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
env_logger::Builder::from_env(
|
||||
env_logger::Env::default().default_filter_or("wal_generate=info"),
|
||||
)
|
||||
.init();
|
||||
let arg_matches = App::new("Postgres WAL generator")
|
||||
.about("Generates Postgres databases with specific WAL properties")
|
||||
.arg(
|
||||
Arg::new("datadir")
|
||||
.short('D')
|
||||
.long("datadir")
|
||||
.takes_value(true)
|
||||
.help("Data directory for the Postgres server")
|
||||
.required(true)
|
||||
)
|
||||
.arg(
|
||||
Arg::new("pg-distrib-dir")
|
||||
.long("pg-distrib-dir")
|
||||
.takes_value(true)
|
||||
.help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
|
||||
.default_value("/usr/local")
|
||||
)
|
||||
.arg(
|
||||
Arg::new("type")
|
||||
.long("type")
|
||||
.takes_value(true)
|
||||
.help("Type of WAL to generate")
|
||||
.possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
|
||||
.required(true)
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let cfg = Conf {
|
||||
pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
|
||||
datadir: arg_matches.value_of("datadir").unwrap().into(),
|
||||
};
|
||||
cfg.initdb()?;
|
||||
let mut srv = cfg.start_server()?;
|
||||
let lsn = match arg_matches.value_of("type").unwrap() {
|
||||
"simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
|
||||
"last_wal_record_crossing_segment" => {
|
||||
generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
|
||||
}
|
||||
"wal_record_crossing_segment_followed_by_small_one" => {
|
||||
generate_wal_record_crossing_segment_followed_by_small_one(
|
||||
&mut srv.connect_with_timeout()?,
|
||||
)?
|
||||
}
|
||||
a => panic!("Unknown --type argument: {}", a),
|
||||
};
|
||||
println!("end_of_wal = {}", lsn);
|
||||
srv.kill();
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,278 +0,0 @@
|
||||
use anyhow::*;
|
||||
use core::time::Duration;
|
||||
use log::*;
|
||||
use postgres::types::PgLsn;
|
||||
use postgres::Client;
|
||||
use std::cmp::Ordering;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::{Command, Stdio};
|
||||
use std::time::Instant;
|
||||
use tempfile::{tempdir, TempDir};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct Conf {
|
||||
pub pg_distrib_dir: PathBuf,
|
||||
pub datadir: PathBuf,
|
||||
}
|
||||
|
||||
pub struct PostgresServer {
|
||||
process: std::process::Child,
|
||||
_unix_socket_dir: TempDir,
|
||||
client_config: postgres::Config,
|
||||
}
|
||||
|
||||
impl Conf {
|
||||
fn pg_bin_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("bin")
|
||||
}
|
||||
|
||||
fn pg_lib_dir(&self) -> PathBuf {
|
||||
self.pg_distrib_dir.join("lib")
|
||||
}
|
||||
|
||||
fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
|
||||
let path = self.pg_bin_dir().join(command);
|
||||
ensure!(path.exists(), "Command {:?} does not exist", path);
|
||||
let mut cmd = Command::new(path);
|
||||
cmd.env_clear()
|
||||
.env("LD_LIBRARY_PATH", self.pg_lib_dir())
|
||||
.env("DYLD_LIBRARY_PATH", self.pg_lib_dir());
|
||||
Ok(cmd)
|
||||
}
|
||||
|
||||
pub fn initdb(&self) -> Result<()> {
|
||||
if let Some(parent) = self.datadir.parent() {
|
||||
info!("Pre-creating parent directory {:?}", parent);
|
||||
// Tests may be run concurrently and there may be a race to create `test_output/`.
|
||||
// std::fs::create_dir_all is guaranteed to have no races with another thread creating directories.
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
info!(
|
||||
"Running initdb in {:?} with user \"postgres\"",
|
||||
self.datadir
|
||||
);
|
||||
let output = self
|
||||
.new_pg_command("initdb")?
|
||||
.arg("-D")
|
||||
.arg(self.datadir.as_os_str())
|
||||
.args(&["-U", "postgres", "--no-instructions", "--no-sync"])
|
||||
.output()?;
|
||||
debug!("initdb output: {:?}", output);
|
||||
ensure!(
|
||||
output.status.success(),
|
||||
"initdb failed, stdout and stderr follow:\n{}{}",
|
||||
String::from_utf8_lossy(&output.stdout),
|
||||
String::from_utf8_lossy(&output.stderr),
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn start_server(&self) -> Result<PostgresServer> {
|
||||
info!("Starting Postgres server in {:?}", self.datadir);
|
||||
let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
|
||||
let unix_socket_dir_path = unix_socket_dir.path().to_owned();
|
||||
let server_process = self
|
||||
.new_pg_command("postgres")?
|
||||
.args(&["-c", "listen_addresses="])
|
||||
.arg("-k")
|
||||
.arg(unix_socket_dir_path.as_os_str())
|
||||
.arg("-D")
|
||||
.arg(self.datadir.as_os_str())
|
||||
.args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
|
||||
.args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
|
||||
.args(&["-c", "shared_preload_libraries=zenith"]) // can only be loaded at startup
|
||||
// Disable background processes as much as possible
|
||||
.args(&["-c", "wal_writer_delay=10s"])
|
||||
.args(&["-c", "autovacuum=off"])
|
||||
.stderr(Stdio::null())
|
||||
.spawn()?;
|
||||
let server = PostgresServer {
|
||||
process: server_process,
|
||||
_unix_socket_dir: unix_socket_dir,
|
||||
client_config: {
|
||||
let mut c = postgres::Config::new();
|
||||
c.host_path(&unix_socket_dir_path);
|
||||
c.user("postgres");
|
||||
c.connect_timeout(Duration::from_millis(1000));
|
||||
c
|
||||
},
|
||||
};
|
||||
Ok(server)
|
||||
}
|
||||
|
||||
pub fn pg_waldump(
|
||||
&self,
|
||||
first_segment_name: &str,
|
||||
last_segment_name: &str,
|
||||
) -> Result<std::process::Output> {
|
||||
let first_segment_file = self.datadir.join(first_segment_name);
|
||||
let last_segment_file = self.datadir.join(last_segment_name);
|
||||
info!(
|
||||
"Running pg_waldump for {} .. {}",
|
||||
first_segment_file.display(),
|
||||
last_segment_file.display()
|
||||
);
|
||||
let output = self
|
||||
.new_pg_command("pg_waldump")?
|
||||
.args(&[
|
||||
&first_segment_file.as_os_str(),
|
||||
&last_segment_file.as_os_str(),
|
||||
])
|
||||
.output()?;
|
||||
debug!("waldump output: {:?}", output);
|
||||
Ok(output)
|
||||
}
|
||||
}
|
||||
|
||||
impl PostgresServer {
|
||||
pub fn connect_with_timeout(&self) -> Result<Client> {
|
||||
let retry_until = Instant::now() + *self.client_config.get_connect_timeout().unwrap();
|
||||
while Instant::now() < retry_until {
|
||||
use std::result::Result::Ok;
|
||||
if let Ok(client) = self.client_config.connect(postgres::NoTls) {
|
||||
return Ok(client);
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
bail!("Connection timed out");
|
||||
}
|
||||
|
||||
pub fn kill(&mut self) {
|
||||
self.process.kill().unwrap();
|
||||
self.process.wait().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresServer {
|
||||
fn drop(&mut self) {
|
||||
use std::result::Result::Ok;
|
||||
match self.process.try_wait() {
|
||||
Ok(Some(_)) => return,
|
||||
Ok(None) => {
|
||||
warn!("Server was not terminated, will be killed");
|
||||
}
|
||||
Err(e) => {
|
||||
error!("Unable to get status of the server: {}, will be killed", e);
|
||||
}
|
||||
}
|
||||
let _ = self.process.kill();
|
||||
}
|
||||
}
|
||||
|
||||
pub trait PostgresClientExt: postgres::GenericClient {
|
||||
fn pg_current_wal_insert_lsn(&mut self) -> Result<PgLsn> {
|
||||
Ok(self
|
||||
.query_one("SELECT pg_current_wal_insert_lsn()", &[])?
|
||||
.get(0))
|
||||
}
|
||||
fn pg_current_wal_flush_lsn(&mut self) -> Result<PgLsn> {
|
||||
Ok(self
|
||||
.query_one("SELECT pg_current_wal_flush_lsn()", &[])?
|
||||
.get(0))
|
||||
}
|
||||
}
|
||||
|
||||
impl<C: postgres::GenericClient> PostgresClientExt for C {}
|
||||
|
||||
fn generate_internal<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
|
||||
) -> Result<PgLsn> {
|
||||
client.execute("create extension if not exists zenith_test_utils", &[])?;
|
||||
|
||||
let wal_segment_size = client.query_one(
|
||||
"select cast(setting as bigint) as setting, unit \
|
||||
from pg_settings where name = 'wal_segment_size'",
|
||||
&[],
|
||||
)?;
|
||||
ensure!(
|
||||
wal_segment_size.get::<_, String>("unit") == "B",
|
||||
"Unexpected wal_segment_size unit"
|
||||
);
|
||||
ensure!(
|
||||
wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
|
||||
"Unexpected wal_segment_size in bytes"
|
||||
);
|
||||
|
||||
let initial_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
info!("LSN initial = {}", initial_lsn);
|
||||
|
||||
let last_lsn = match f(client, initial_lsn)? {
|
||||
None => client.pg_current_wal_insert_lsn()?,
|
||||
Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were inserted after the generated WAL"),
|
||||
Ordering::Equal => last_lsn,
|
||||
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
|
||||
},
|
||||
};
|
||||
|
||||
// Some records may be not flushed, e.g. non-transactional logical messages.
|
||||
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
|
||||
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
|
||||
Ordering::Less => bail!("Some records were flushed after the generated WAL"),
|
||||
Ordering::Equal => {}
|
||||
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
|
||||
}
|
||||
Ok(last_lsn)
|
||||
}
|
||||
|
||||
pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
|
||||
generate_internal(client, |client, _| {
|
||||
client.execute("CREATE table t(x int)", &[])?;
|
||||
Ok(None)
|
||||
})
|
||||
}
|
||||
|
||||
fn generate_single_logical_message(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
transactional: bool,
|
||||
) -> Result<PgLsn> {
|
||||
generate_internal(client, |client, initial_lsn| {
|
||||
ensure!(
|
||||
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
|
||||
"Initial LSN is too far in the future"
|
||||
);
|
||||
|
||||
let message_lsn: PgLsn = client
|
||||
.query_one(
|
||||
"select pg_logical_emit_message($1, 'big-16mb-msg', \
|
||||
concat(repeat('abcd', 16 * 256 * 1024), 'end')) as message_lsn",
|
||||
&[&transactional],
|
||||
)?
|
||||
.get("message_lsn");
|
||||
ensure!(
|
||||
message_lsn > PgLsn::from(0x0200_0000 + 4 * 8192),
|
||||
"Logical message did not cross the segment boundary"
|
||||
);
|
||||
ensure!(
|
||||
message_lsn < PgLsn::from(0x0400_0000),
|
||||
"Logical message crossed two segments"
|
||||
);
|
||||
|
||||
if transactional {
|
||||
// Transactional logical messages are part of a transaction, so the one above is
|
||||
// followed by a small COMMIT record.
|
||||
|
||||
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
|
||||
ensure!(
|
||||
message_lsn < after_message_lsn,
|
||||
"No record found after the emitted message"
|
||||
);
|
||||
Ok(Some(after_message_lsn))
|
||||
} else {
|
||||
Ok(Some(message_lsn))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn generate_wal_record_crossing_segment_followed_by_small_one(
|
||||
client: &mut impl postgres::GenericClient,
|
||||
) -> Result<PgLsn> {
|
||||
generate_single_logical_message(client, true)
|
||||
}
|
||||
|
||||
pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
|
||||
client: &mut C,
|
||||
) -> Result<PgLsn> {
|
||||
generate_single_logical_message(client, false)
|
||||
}
|
||||
@@ -1,20 +0,0 @@
|
||||
[package]
|
||||
name = "remote_storage"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
tokio = { version = "1.17", features = ["sync", "macros", "fs", "io-util"] }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
tracing = "0.1.27"
|
||||
rusoto_core = "0.48"
|
||||
rusoto_s3 = "0.48"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
async-trait = "0.1"
|
||||
|
||||
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.2"
|
||||
@@ -1,233 +0,0 @@
|
||||
//! A set of generic storage abstractions for the page server to use when backing up and restoring its state from the external storage.
|
||||
//! No other modules from this tree are supposed to be used directly by the external code.
|
||||
//!
|
||||
//! [`RemoteStorage`] trait a CRUD-like generic abstraction to use for adapting external storages with a few implementations:
|
||||
//! * [`local_fs`] allows to use local file system as an external storage
|
||||
//! * [`s3_bucket`] uses AWS S3 bucket as an external storage
|
||||
//!
|
||||
mod local_fs;
|
||||
mod s3_bucket;
|
||||
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::HashMap,
|
||||
ffi::OsStr,
|
||||
num::{NonZeroU32, NonZeroUsize},
|
||||
path::{Path, PathBuf},
|
||||
};
|
||||
|
||||
use anyhow::Context;
|
||||
use tokio::io;
|
||||
use tracing::info;
|
||||
|
||||
pub use self::{
|
||||
local_fs::LocalFs,
|
||||
s3_bucket::{S3Bucket, S3ObjectKey},
|
||||
};
|
||||
|
||||
/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
|
||||
/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
|
||||
/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
|
||||
/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
|
||||
/// ~200 RPS for IAM services
|
||||
/// https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html
|
||||
/// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
|
||||
/// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
|
||||
pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
|
||||
|
||||
/// Storage (potentially remote) API to manage its state.
|
||||
/// This storage tries to be unaware of any layered repository context,
|
||||
/// providing basic CRUD operations for storage files.
|
||||
#[async_trait::async_trait]
|
||||
pub trait RemoteStorage: Send + Sync {
|
||||
/// A way to uniquely reference a file in the remote storage.
|
||||
type RemoteObjectId;
|
||||
|
||||
/// Attempts to derive the storage path out of the local path, if the latter is correct.
|
||||
fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
|
||||
|
||||
/// Gets the download path of the given storage file.
|
||||
fn local_path(&self, remote_object_id: &Self::RemoteObjectId) -> anyhow::Result<PathBuf>;
|
||||
|
||||
/// Lists all items the storage has right now.
|
||||
async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
|
||||
|
||||
/// Streams the local file contents into remote into the remote storage entry.
|
||||
async fn upload(
|
||||
&self,
|
||||
from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
|
||||
// S3 PUT request requires the content length to be specified,
|
||||
// otherwise it starts to fail with the concurrent connection count increasing.
|
||||
from_size_bytes: usize,
|
||||
to: &Self::RemoteObjectId,
|
||||
metadata: Option<StorageMetadata>,
|
||||
) -> anyhow::Result<()>;
|
||||
|
||||
/// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
/// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
|
||||
/// Returns the metadata, if any was stored with the file previously.
|
||||
async fn download_byte_range(
|
||||
&self,
|
||||
from: &Self::RemoteObjectId,
|
||||
start_inclusive: u64,
|
||||
end_exclusive: Option<u64>,
|
||||
to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
|
||||
) -> anyhow::Result<Option<StorageMetadata>>;
|
||||
|
||||
async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
|
||||
}
|
||||
|
||||
/// Every storage, currently supported.
|
||||
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
|
||||
pub enum GenericRemoteStorage {
|
||||
Local(LocalFs),
|
||||
S3(S3Bucket),
|
||||
}
|
||||
|
||||
impl GenericRemoteStorage {
|
||||
pub fn new(
|
||||
working_directory: PathBuf,
|
||||
storage_config: &RemoteStorageConfig,
|
||||
) -> anyhow::Result<Self> {
|
||||
match &storage_config.storage {
|
||||
RemoteStorageKind::LocalFs(root) => {
|
||||
info!("Using fs root '{}' as a remote storage", root.display());
|
||||
LocalFs::new(root.clone(), working_directory).map(GenericRemoteStorage::Local)
|
||||
}
|
||||
RemoteStorageKind::AwsS3(s3_config) => {
|
||||
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}'",
|
||||
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
|
||||
S3Bucket::new(s3_config, working_directory).map(GenericRemoteStorage::S3)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.
|
||||
/// Immutable, cannot be changed once the file is created.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct StorageMetadata(HashMap<String, String>);
|
||||
|
||||
fn strip_path_prefix<'a>(prefix: &'a Path, path: &'a Path) -> anyhow::Result<&'a Path> {
|
||||
if prefix == path {
|
||||
anyhow::bail!(
|
||||
"Prefix and the path are equal, cannot strip: '{}'",
|
||||
prefix.display()
|
||||
)
|
||||
} else {
|
||||
path.strip_prefix(prefix).with_context(|| {
|
||||
format!(
|
||||
"Path '{}' is not prefixed with '{}'",
|
||||
path.display(),
|
||||
prefix.display(),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between the API user and the remote storage.
|
||||
pub max_concurrent_syncs: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple remote storage users at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
/// AWS S3 has various limits on its API calls, we need not to exceed those.
|
||||
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
|
||||
pub concurrency_limit: NonZeroUsize,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.field("concurrency_limit", &self.concurrency_limit)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
|
||||
let new_extension = match original_path
|
||||
.as_ref()
|
||||
.extension()
|
||||
.map(OsStr::to_string_lossy)
|
||||
{
|
||||
Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
|
||||
None => Cow::Borrowed(suffix),
|
||||
};
|
||||
original_path
|
||||
.as_ref()
|
||||
.with_extension(new_extension.as_ref())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_path_with_suffix_extension() {
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp").to_string_lossy(),
|
||||
"/foo/bar.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, "temp.temp").to_string_lossy(),
|
||||
"/foo/bar.baz.temp.temp"
|
||||
);
|
||||
let p = PathBuf::from("/foo/bar.baz");
|
||||
assert_eq!(
|
||||
&path_with_suffix_extension(&p, ".temp").to_string_lossy(),
|
||||
"/foo/bar.baz..temp"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -1,105 +0,0 @@
|
||||
//! `utils` is intended to be a place to put code that is shared
|
||||
//! between other crates in this repository.
|
||||
|
||||
#![allow(clippy::manual_range_contains)]
|
||||
|
||||
/// `Lsn` type implements common tasks on Log Sequence Numbers
|
||||
pub mod lsn;
|
||||
/// SeqWait allows waiting for a future sequence number to arrive
|
||||
pub mod seqwait;
|
||||
|
||||
/// append only ordered map implemented with a Vec
|
||||
pub mod vec_map;
|
||||
|
||||
// Async version of SeqWait. Currently unused.
|
||||
// pub mod seqwait_async;
|
||||
|
||||
pub mod bin_ser;
|
||||
pub mod postgres_backend;
|
||||
pub mod pq_proto;
|
||||
|
||||
// dealing with connstring parsing and handy access to it's parts
|
||||
pub mod connstring;
|
||||
|
||||
// helper functions for creating and fsyncing directories/trees
|
||||
pub mod crashsafe_dir;
|
||||
|
||||
// common authentication routines
|
||||
pub mod auth;
|
||||
|
||||
// utility functions and helper traits for unified unique id generation/serialization etc.
|
||||
pub mod zid;
|
||||
// http endpoint utils
|
||||
pub mod http;
|
||||
|
||||
// socket splitting utils
|
||||
pub mod sock_split;
|
||||
|
||||
// common log initialisation routine
|
||||
pub mod logging;
|
||||
|
||||
// Misc
|
||||
pub mod accum;
|
||||
pub mod shutdown;
|
||||
|
||||
// Tools for calling certain async methods in sync contexts
|
||||
pub mod sync;
|
||||
|
||||
// Utility for binding TcpListeners with proper socket options.
|
||||
pub mod tcp_listener;
|
||||
|
||||
// Utility for putting a raw file descriptor into non-blocking mode
|
||||
pub mod nonblock;
|
||||
|
||||
// Default signal handling
|
||||
pub mod signals;
|
||||
|
||||
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
|
||||
///
|
||||
/// we have several cases:
|
||||
/// * building locally from git repo
|
||||
/// * building in CI from git repo
|
||||
/// * building in docker (either in CI or locally)
|
||||
///
|
||||
/// One thing to note is that .git is not available in docker (and it is bad to include it there).
|
||||
/// So everything becides docker build is covered by git_version crate, and docker uses a `GIT_VERSION` argument to get the value required.
|
||||
/// It takes variable from build process env and puts it to the rustc env. And then we can retrieve it here by using env! macro.
|
||||
/// Git version received from environment variable used as a fallback in git_version invokation.
|
||||
/// And to avoid running buildscript every recompilation, we use rerun-if-env-changed option.
|
||||
/// So the build script will be run only when GIT_VERSION envvar has changed.
|
||||
///
|
||||
/// Why not to use buildscript to get git commit sha directly without procmacro from different crate?
|
||||
/// Caching and workspaces complicates that. In case `utils` is not
|
||||
/// recompiled due to caching then version may become outdated.
|
||||
/// git_version crate handles that case by introducing a dependency on .git internals via include_bytes! macro,
|
||||
/// so if we changed the index state git_version will pick that up and rerun the macro.
|
||||
///
|
||||
/// Note that with git_version prefix is `git:` and in case of git version from env its `git-env:`.
|
||||
///
|
||||
/// #############################################################################################
|
||||
/// TODO this macro is not the way the library is intended to be used, see https://github.com/neondatabase/neon/issues/1565 for details.
|
||||
/// We use `cachepot` to reduce our current CI build times: https://github.com/neondatabase/cloud/pull/1033#issuecomment-1100935036
|
||||
/// Yet, it seems to ignore the GIT_VERSION env variable, passed to Docker build, even with build.rs that contains
|
||||
/// `println!("cargo:rerun-if-env-changed=GIT_VERSION");` code for cachepot cache invalidation.
|
||||
/// The problem needs further investigation and regular `const` declaration instead of a macro.
|
||||
#[macro_export]
|
||||
macro_rules! project_git_version {
|
||||
($const_identifier:ident) => {
|
||||
const $const_identifier: &str = git_version::git_version!(
|
||||
prefix = "git:",
|
||||
fallback = concat!(
|
||||
"git-env:",
|
||||
env!("GIT_VERSION", "Missing GIT_VERSION envvar")
|
||||
),
|
||||
args = ["--abbrev=40", "--always", "--dirty=-modified"] // always use full sha
|
||||
);
|
||||
};
|
||||
}
|
||||
|
||||
/// Same as `assert!`, but evaluated during compilation and gets optimized out in runtime.
|
||||
#[macro_export]
|
||||
macro_rules! const_assert {
|
||||
($($args:tt)*) => {
|
||||
const _: () = assert!($($args)*);
|
||||
};
|
||||
}
|
||||
25
monitoring/docker-compose.yml
Normal file
25
monitoring/docker-compose.yml
Normal file
@@ -0,0 +1,25 @@
|
||||
version: "3"
|
||||
services:
|
||||
|
||||
prometheus:
|
||||
container_name: prometheus
|
||||
image: prom/prometheus:latest
|
||||
volumes:
|
||||
- ./prometheus.yaml:/etc/prometheus/prometheus.yml
|
||||
# ports:
|
||||
# - "9090:9090"
|
||||
# TODO: find a proper portable solution
|
||||
network_mode: "host"
|
||||
|
||||
grafana:
|
||||
image: grafana/grafana:latest
|
||||
volumes:
|
||||
- ./grafana.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
|
||||
environment:
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
|
||||
- GF_AUTH_DISABLE_LOGIN_FORM=true
|
||||
# ports:
|
||||
# - "3000:3000"
|
||||
# TODO: find a proper portable solution
|
||||
network_mode: "host"
|
||||
12
monitoring/grafana.yaml
Normal file
12
monitoring/grafana.yaml
Normal file
@@ -0,0 +1,12 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
access: proxy
|
||||
orgId: 1
|
||||
url: http://localhost:9090
|
||||
basicAuth: false
|
||||
isDefault: false
|
||||
version: 1
|
||||
editable: false
|
||||
5
monitoring/prometheus.yaml
Normal file
5
monitoring/prometheus.yaml
Normal file
@@ -0,0 +1,5 @@
|
||||
scrape_configs:
|
||||
- job_name: 'default'
|
||||
scrape_interval: 10s
|
||||
static_configs:
|
||||
- targets: ['localhost:9898']
|
||||
@@ -3,14 +3,6 @@ name = "pageserver"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[features]
|
||||
# It is simpler infra-wise to have failpoints enabled by default
|
||||
# It shouldn't affect perf in any way because failpoints
|
||||
# are not placed in hot code paths
|
||||
default = ["failpoints"]
|
||||
profiling = ["pprof"]
|
||||
failpoints = ["fail/failpoints"]
|
||||
|
||||
[dependencies]
|
||||
chrono = "0.4.19"
|
||||
rand = "0.8.3"
|
||||
@@ -22,13 +14,15 @@ hex = "0.4.3"
|
||||
hyper = "0.14"
|
||||
itertools = "0.10.3"
|
||||
lazy_static = "1.4.0"
|
||||
log = "0.4.14"
|
||||
clap = "3.0"
|
||||
daemonize = "0.4.1"
|
||||
tokio = { version = "1.17", features = ["process", "sync", "macros", "fs", "rt", "io-util", "time"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
|
||||
tokio-util = { version = "0.7", features = ["io"] }
|
||||
postgres-types = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="2949d98df52587d562986aad155dd4e889e408b7" }
|
||||
tokio-stream = "0.1.8"
|
||||
anyhow = { version = "1.0", features = ["backtrace"] }
|
||||
crc32c = "0.6.0"
|
||||
@@ -38,27 +32,27 @@ humantime = "2.1.0"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
serde_with = "1.12.0"
|
||||
humantime-serde = "1.1.1"
|
||||
|
||||
pprof = { git = "https://github.com/neondatabase/pprof-rs.git", branch = "wallclock-profiling", features = ["flamegraph"], optional = true }
|
||||
|
||||
toml_edit = { version = "0.13", features = ["easy"] }
|
||||
scopeguard = "1.1.0"
|
||||
const_format = "0.2.21"
|
||||
tracing = "0.1.27"
|
||||
tracing-futures = "0.2"
|
||||
signal-hook = "0.3.10"
|
||||
url = "2"
|
||||
nix = "0.23"
|
||||
once_cell = "1.8.0"
|
||||
crossbeam-utils = "0.8.5"
|
||||
fail = "0.5.0"
|
||||
git-version = "0.3.5"
|
||||
|
||||
postgres_ffi = { path = "../libs/postgres_ffi" }
|
||||
etcd_broker = { path = "../libs/etcd_broker" }
|
||||
metrics = { path = "../libs/metrics" }
|
||||
utils = { path = "../libs/utils" }
|
||||
remote_storage = { path = "../libs/remote_storage" }
|
||||
rusoto_core = "0.47"
|
||||
rusoto_s3 = "0.47"
|
||||
async-trait = "0.1"
|
||||
async-compression = {version = "0.3", features = ["zstd", "tokio"]}
|
||||
|
||||
postgres_ffi = { path = "../postgres_ffi" }
|
||||
zenith_metrics = { path = "../zenith_metrics" }
|
||||
zenith_utils = { path = "../zenith_utils" }
|
||||
workspace_hack = { version = "0.1", path = "../workspace_hack" }
|
||||
|
||||
[dev-dependencies]
|
||||
|
||||
@@ -135,7 +135,7 @@ The backup service is disabled by default and can be enabled to interact with a
|
||||
|
||||
CLI examples:
|
||||
* Local FS: `${PAGESERVER_BIN} -c "remote_storage={local_path='/some/local/path/'}"`
|
||||
* AWS S3 : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
|
||||
* AWS S3 : `${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/',access_key_id='SOMEKEYAAAAASADSAH*#',secret_access_key='SOMEsEcReTsd292v'}"`
|
||||
|
||||
For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
|
||||
For local S3 installations, refer to the their documentation for name format and credentials.
|
||||
@@ -155,9 +155,11 @@ or
|
||||
bucket_name = 'some-sample-bucket'
|
||||
bucket_region = 'eu-north-1'
|
||||
prefix_in_bucket = '/test_prefix/'
|
||||
access_key_id = 'SOMEKEYAAAAASADSAH*#'
|
||||
secret_access_key = 'SOMEsEcReTsd292v'
|
||||
```
|
||||
|
||||
`AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.
|
||||
Also, `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` variables can be used to specify the credentials instead of any of the ways above.
|
||||
|
||||
TODO: Sharding
|
||||
--------------------
|
||||
|
||||
@@ -10,37 +10,31 @@
|
||||
//! This module is responsible for creation of such tarball
|
||||
//! from data stored in object storage.
|
||||
//!
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use anyhow::{ensure, Context, Result};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use fail::fail_point;
|
||||
use log::*;
|
||||
use std::fmt::Write as FmtWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::sync::Arc;
|
||||
use std::time::SystemTime;
|
||||
use tar::{Builder, EntryType, Header};
|
||||
use tracing::*;
|
||||
|
||||
use crate::reltag::SlruKind;
|
||||
use crate::repository::Timeline;
|
||||
use crate::DatadirTimelineImpl;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::*;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
/// This is short-living object only for the time of tarball creation,
|
||||
/// created mostly to avoid passing a lot of parameters between various functions
|
||||
/// used for constructing tarball.
|
||||
pub struct Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
ar: Builder<AbortableWrite<W>>,
|
||||
pub struct Basebackup<'a> {
|
||||
ar: Builder<&'a mut dyn Write>,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
pub lsn: Lsn,
|
||||
prev_record_lsn: Lsn,
|
||||
|
||||
finished: bool,
|
||||
}
|
||||
|
||||
// Create basebackup with non-rel data in it. Omit relational data.
|
||||
@@ -50,15 +44,12 @@ where
|
||||
// * When working without safekeepers. In this situation it is important to match the lsn
|
||||
// we are taking basebackup on with the lsn that is used in pageserver's walreceiver
|
||||
// to start the replication.
|
||||
impl<'a, W> Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
impl<'a> Basebackup<'a> {
|
||||
pub fn new(
|
||||
write: W,
|
||||
write: &'a mut dyn Write,
|
||||
timeline: &'a Arc<DatadirTimelineImpl>,
|
||||
req_lsn: Option<Lsn>,
|
||||
) -> Result<Basebackup<'a, W>> {
|
||||
) -> Result<Basebackup<'a>> {
|
||||
// Compute postgres doesn't have any previous WAL files, but the first
|
||||
// record that it's going to write needs to include the LSN of the
|
||||
// previous record (xl_prev). We include prev_record_lsn in the
|
||||
@@ -99,15 +90,14 @@ where
|
||||
);
|
||||
|
||||
Ok(Basebackup {
|
||||
ar: Builder::new(AbortableWrite::new(write)),
|
||||
ar: Builder::new(write),
|
||||
timeline,
|
||||
lsn: backup_lsn,
|
||||
prev_record_lsn: backup_prev,
|
||||
finished: false,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn send_tarball(mut self) -> anyhow::Result<()> {
|
||||
pub fn send_tarball(&mut self) -> anyhow::Result<()> {
|
||||
// Create pgdata subdirs structure
|
||||
for dir in pg_constants::PGDATA_SUBDIRS.iter() {
|
||||
let header = new_tar_header_dir(*dir)?;
|
||||
@@ -145,14 +135,9 @@ where
|
||||
self.add_twophase_file(xid)?;
|
||||
}
|
||||
|
||||
fail_point!("basebackup-before-control-file", |_| {
|
||||
bail!("failpoint basebackup-before-control-file")
|
||||
});
|
||||
|
||||
// Generate pg_control and bootstrap WAL segment.
|
||||
self.add_pgcontrol_file()?;
|
||||
self.ar.finish()?;
|
||||
self.finished = true;
|
||||
debug!("all tarred up!");
|
||||
Ok(())
|
||||
}
|
||||
@@ -169,17 +154,9 @@ where
|
||||
let img = self
|
||||
.timeline
|
||||
.get_slru_page_at_lsn(slru, segno, blknum, self.lsn)?;
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
|
||||
if slru == SlruKind::Clog {
|
||||
ensure!(
|
||||
img.len() == pg_constants::BLCKSZ as usize
|
||||
|| img.len() == pg_constants::BLCKSZ as usize + 8
|
||||
);
|
||||
} else {
|
||||
ensure!(img.len() == pg_constants::BLCKSZ as usize);
|
||||
}
|
||||
|
||||
slru_buf.extend_from_slice(&img[..pg_constants::BLCKSZ as usize]);
|
||||
slru_buf.extend_from_slice(&img);
|
||||
}
|
||||
|
||||
let segname = format!("{}/{:>04X}", slru.to_str(), segno);
|
||||
@@ -338,27 +315,13 @@ where
|
||||
let wal_file_name = XLogFileName(PG_TLI, segno, pg_constants::WAL_SEGMENT_SIZE);
|
||||
let wal_file_path = format!("pg_wal/{}", wal_file_name);
|
||||
let header = new_tar_header(&wal_file_path, pg_constants::WAL_SEGMENT_SIZE as u64)?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier)
|
||||
.map_err(|e| anyhow!(e).context("Failed generating wal segment"))?;
|
||||
let wal_seg = generate_wal_segment(segno, pg_control.system_identifier);
|
||||
ensure!(wal_seg.len() == pg_constants::WAL_SEGMENT_SIZE);
|
||||
self.ar.append(&header, &wal_seg[..])?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W> Drop for Basebackup<'a, W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
/// If the basebackup was not finished, prevent the Archive::drop() from
|
||||
/// writing the end-of-archive marker.
|
||||
fn drop(&mut self) {
|
||||
if !self.finished {
|
||||
self.ar.get_mut().abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Create new tarball entry header
|
||||
//
|
||||
@@ -394,49 +357,3 @@ fn new_tar_header_dir(path: &str) -> anyhow::Result<Header> {
|
||||
header.set_cksum();
|
||||
Ok(header)
|
||||
}
|
||||
|
||||
/// A wrapper that passes through all data to the underlying Write,
|
||||
/// until abort() is called.
|
||||
///
|
||||
/// tar::Builder has an annoying habit of finishing the archive with
|
||||
/// a valid tar end-of-archive marker (two 512-byte sectors of zeros),
|
||||
/// even if an error occurs and we don't finish building the archive.
|
||||
/// We'd rather abort writing the tarball immediately than construct
|
||||
/// a seemingly valid but incomplete archive. This wrapper allows us
|
||||
/// to swallow the end-of-archive marker that Builder::drop() emits,
|
||||
/// without writing it to the underlying sink.
|
||||
///
|
||||
struct AbortableWrite<W> {
|
||||
w: W,
|
||||
aborted: bool,
|
||||
}
|
||||
|
||||
impl<W> AbortableWrite<W> {
|
||||
pub fn new(w: W) -> Self {
|
||||
AbortableWrite { w, aborted: false }
|
||||
}
|
||||
|
||||
pub fn abort(&mut self) {
|
||||
self.aborted = true;
|
||||
}
|
||||
}
|
||||
|
||||
impl<W> Write for AbortableWrite<W>
|
||||
where
|
||||
W: Write,
|
||||
{
|
||||
fn write(&mut self, data: &[u8]) -> io::Result<usize> {
|
||||
if self.aborted {
|
||||
Ok(data.len())
|
||||
} else {
|
||||
self.w.write(data)
|
||||
}
|
||||
}
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
if self.aborted {
|
||||
Ok(())
|
||||
} else {
|
||||
self.w.flush()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,9 +7,7 @@ use pageserver::layered_repository::dump_layerfile_from_path;
|
||||
use pageserver::page_cache;
|
||||
use pageserver::virtual_file;
|
||||
use std::path::PathBuf;
|
||||
use utils::project_git_version;
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith dump_layerfile utility")
|
||||
|
||||
@@ -1,46 +1,39 @@
|
||||
//! Main entry point for the Page Server executable.
|
||||
|
||||
use std::{env, fs::File, path::Path, process, str::FromStr, thread::sleep, time::Duration};
|
||||
use std::{env, path::Path, str::FromStr};
|
||||
use tracing::*;
|
||||
use zenith_utils::{
|
||||
auth::JwtAuth,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
GIT_VERSION,
|
||||
};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use daemonize::Daemonize;
|
||||
|
||||
use fail::FailScenario;
|
||||
use pageserver::{
|
||||
config::{defaults::*, PageServerConf},
|
||||
http, page_cache, page_service, profiling, tenant_mgr, thread_mgr,
|
||||
http, page_cache, page_service,
|
||||
remote_storage::{self, SyncStartupData},
|
||||
repository::{Repository, TimelineSyncStatusUpdate},
|
||||
tenant_mgr, thread_mgr,
|
||||
thread_mgr::ThreadKind,
|
||||
timelines, virtual_file, LOG_FILE_NAME,
|
||||
};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::endpoint,
|
||||
logging,
|
||||
postgres_backend::AuthType,
|
||||
project_git_version,
|
||||
shutdown::exit_now,
|
||||
signals::{self, Signal},
|
||||
tcp_listener,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
|
||||
fn version() -> String {
|
||||
format!(
|
||||
"{GIT_VERSION} profiling:{} failpoints:{}",
|
||||
cfg!(feature = "profiling"),
|
||||
fail::has_failpoints()
|
||||
)
|
||||
}
|
||||
use zenith_utils::http::endpoint;
|
||||
use zenith_utils::shutdown::exit_now;
|
||||
use zenith_utils::signals::{self, Signal};
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
zenith_metrics::set_common_metrics_prefix("pageserver");
|
||||
let arg_matches = App::new("Zenith page server")
|
||||
.about("Materializes WAL stream to pages and serves them to the postgres")
|
||||
.version(&*version())
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("daemonize")
|
||||
.short('d')
|
||||
@@ -85,25 +78,8 @@ fn main() -> anyhow::Result<()> {
|
||||
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there).
|
||||
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("enabled-features")
|
||||
.long("enabled-features")
|
||||
.takes_value(false)
|
||||
.help("Show enabled compile time features"),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
if arg_matches.is_present("enabled-features") {
|
||||
let features: &[&str] = &[
|
||||
#[cfg(feature = "failpoints")]
|
||||
"failpoints",
|
||||
#[cfg(feature = "profiling")]
|
||||
"profiling",
|
||||
];
|
||||
println!("{{\"features\": {features:?} }}");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let workdir = Path::new(arg_matches.value_of("workdir").unwrap_or(".zenith"));
|
||||
let workdir = workdir
|
||||
.canonicalize()
|
||||
@@ -184,9 +160,6 @@ fn main() -> anyhow::Result<()> {
|
||||
// as a ref.
|
||||
let conf: &'static PageServerConf = Box::leak(Box::new(conf));
|
||||
|
||||
// Initialize up failpoints support
|
||||
let scenario = FailScenario::setup();
|
||||
|
||||
// Basic initialization of things that don't change after startup
|
||||
virtual_file::init(conf.max_file_descriptors);
|
||||
page_cache::init(conf.page_cache_size);
|
||||
@@ -202,19 +175,20 @@ fn main() -> anyhow::Result<()> {
|
||||
cfg_file_path.display()
|
||||
)
|
||||
})?;
|
||||
Ok(())
|
||||
} else {
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")?;
|
||||
start_pageserver(conf, daemonize).context("Failed to start pageserver")
|
||||
}
|
||||
|
||||
scenario.teardown();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()> {
|
||||
// Initialize logger
|
||||
let log_file = logging::init(LOG_FILE_NAME, daemonize)?;
|
||||
|
||||
info!("version: {GIT_VERSION}");
|
||||
// TODO init only if configured
|
||||
pageserver::wal_metadata::init(conf).expect("wal_metadata init failed");
|
||||
|
||||
info!("version: {}", GIT_VERSION);
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
@@ -254,14 +228,52 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
// Otherwise, the coverage data will be damaged.
|
||||
match daemonize.exit_action(|| exit_now(0)).start() {
|
||||
Ok(_) => info!("Success, daemonized"),
|
||||
Err(err) => bail!("{err}. could not daemonize. bailing."),
|
||||
Err(err) => error!(%err, "could not daemonize"),
|
||||
}
|
||||
}
|
||||
|
||||
let signals = signals::install_shutdown_handlers()?;
|
||||
|
||||
// start profiler (if enabled)
|
||||
let profiler_guard = profiling::init_profiler(conf);
|
||||
// Initialize repositories with locally available timelines.
|
||||
// Timelines that are only partially available locally (remote storage has more data than this pageserver)
|
||||
// are scheduled for download and added to the repository once download is completed.
|
||||
let SyncStartupData {
|
||||
remote_index,
|
||||
local_timeline_init_statuses,
|
||||
} = remote_storage::start_local_timeline_sync(conf)
|
||||
.context("Failed to set up local files sync with external storage")?;
|
||||
|
||||
for (tenant_id, local_timeline_init_statuses) in local_timeline_init_statuses {
|
||||
// initialize local tenant
|
||||
let repo = tenant_mgr::load_local_repo(conf, tenant_id, &remote_index);
|
||||
for (timeline_id, init_status) in local_timeline_init_statuses {
|
||||
match init_status {
|
||||
remote_storage::LocalTimelineInitStatus::LocallyComplete => {
|
||||
debug!("timeline {} for tenant {} is locally complete, registering it in repository", tenant_id, timeline_id);
|
||||
// Lets fail here loudly to be on the safe side.
|
||||
// XXX: It may be a better api to actually distinguish between repository startup
|
||||
// and processing of newly downloaded timelines.
|
||||
repo.apply_timeline_remote_sync_status_update(
|
||||
timeline_id,
|
||||
TimelineSyncStatusUpdate::Downloaded,
|
||||
)
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"Failed to bootstrap timeline {} for tenant {}",
|
||||
timeline_id, tenant_id
|
||||
)
|
||||
})?
|
||||
}
|
||||
remote_storage::LocalTimelineInitStatus::NeedsSync => {
|
||||
debug!(
|
||||
"timeline {} for tenant {} needs sync, \
|
||||
so skipped for adding into repository until sync is finished",
|
||||
tenant_id, timeline_id
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// initialize authentication for incoming connections
|
||||
let auth = match &conf.auth_type {
|
||||
@@ -274,24 +286,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
};
|
||||
info!("Using auth: {:#?}", conf.auth_type);
|
||||
|
||||
let remote_index = tenant_mgr::init_tenant_mgr(conf)?;
|
||||
|
||||
// Create file and frequently check if it's still here
|
||||
thread_mgr::spawn(
|
||||
ThreadKind::HttpEndpointListener,
|
||||
None,
|
||||
None,
|
||||
"http_endpoint_thread",
|
||||
true,
|
||||
move || {
|
||||
File::create("delete-me.txt").expect("FFFF failed creating file");
|
||||
loop {
|
||||
File::open("delete-me.txt").expect("FFFF cannot find file");
|
||||
sleep(Duration::from_millis(1));
|
||||
}
|
||||
},
|
||||
)?;
|
||||
|
||||
// Spawn a new thread for the http endpoint
|
||||
// bind before launching separate thread so the error reported before startup exits
|
||||
let auth_cloned = auth.clone();
|
||||
@@ -300,9 +294,9 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
None,
|
||||
None,
|
||||
"http_endpoint_thread",
|
||||
true,
|
||||
false,
|
||||
move || {
|
||||
let router = http::make_router(conf, auth_cloned, remote_index)?;
|
||||
let router = http::make_router(conf, auth_cloned, remote_index);
|
||||
endpoint::serve_thread_main(router, http_listener, thread_mgr::shutdown_watcher())
|
||||
},
|
||||
)?;
|
||||
@@ -314,7 +308,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
None,
|
||||
None,
|
||||
"libpq endpoint thread",
|
||||
true,
|
||||
false,
|
||||
move || page_service::thread_main(conf, auth, pageserver_listener, conf.auth_type),
|
||||
)?;
|
||||
|
||||
@@ -324,7 +318,6 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"Got {}. Terminating in immediate shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
std::process::exit(111);
|
||||
}
|
||||
|
||||
@@ -333,8 +326,7 @@ fn start_pageserver(conf: &'static PageServerConf, daemonize: bool) -> Result<()
|
||||
"Got {}. Terminating gracefully in fast shutdown mode",
|
||||
signal.name()
|
||||
);
|
||||
profiling::exit_profiler(conf, &profiler_guard);
|
||||
pageserver::shutdown_pageserver(0);
|
||||
pageserver::shutdown_pageserver();
|
||||
unreachable!()
|
||||
}
|
||||
})
|
||||
|
||||
334
pageserver/src/bin/pageserver_zst.rs
Normal file
334
pageserver/src/bin/pageserver_zst.rs
Normal file
@@ -0,0 +1,334 @@
|
||||
//! A CLI helper to deal with remote storage (S3, usually) blobs as archives.
|
||||
//! See [`compression`] for more details about the archives.
|
||||
|
||||
use std::{collections::BTreeSet, path::Path};
|
||||
|
||||
use anyhow::{bail, ensure, Context};
|
||||
use clap::{App, Arg};
|
||||
use pageserver::{
|
||||
layered_repository::metadata::{TimelineMetadata, METADATA_FILE_NAME},
|
||||
remote_storage::compression,
|
||||
};
|
||||
use tokio::{fs, io};
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
const LIST_SUBCOMMAND: &str = "list";
|
||||
const ARCHIVE_ARG_NAME: &str = "archive";
|
||||
|
||||
const EXTRACT_SUBCOMMAND: &str = "extract";
|
||||
const TARGET_DIRECTORY_ARG_NAME: &str = "target_directory";
|
||||
|
||||
const CREATE_SUBCOMMAND: &str = "create";
|
||||
const SOURCE_DIRECTORY_ARG_NAME: &str = "source_directory";
|
||||
|
||||
#[tokio::main(flavor = "current_thread")]
|
||||
async fn main() -> anyhow::Result<()> {
|
||||
let arg_matches = App::new("pageserver zst blob [un]compressor utility")
|
||||
.version(GIT_VERSION)
|
||||
.subcommands(vec![
|
||||
App::new(LIST_SUBCOMMAND)
|
||||
.about("List the archive contents")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to list the contents of"),
|
||||
),
|
||||
App::new(EXTRACT_SUBCOMMAND)
|
||||
.about("Extracts the archive into the directory")
|
||||
.arg(
|
||||
Arg::new(ARCHIVE_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("An archive to extract"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to extract the archive into. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
App::new(CREATE_SUBCOMMAND)
|
||||
.about("Creates an archive with the contents of a directory (only the first level files are taken, metadata file has to be present in the same directory)")
|
||||
.arg(
|
||||
Arg::new(SOURCE_DIRECTORY_ARG_NAME)
|
||||
.required(true)
|
||||
.takes_value(true)
|
||||
.help("A directory to use for creating the archive"),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(TARGET_DIRECTORY_ARG_NAME)
|
||||
.required(false)
|
||||
.takes_value(true)
|
||||
.help("A directory to create the archive in. Optional, will use the current directory if not specified"),
|
||||
),
|
||||
])
|
||||
.get_matches();
|
||||
|
||||
let subcommand_name = match arg_matches.subcommand_name() {
|
||||
Some(name) => name,
|
||||
None => bail!("No subcommand specified"),
|
||||
};
|
||||
|
||||
let subcommand_matches = match arg_matches.subcommand_matches(subcommand_name) {
|
||||
Some(matches) => matches,
|
||||
None => bail!(
|
||||
"No subcommand arguments were recognized for subcommand '{}'",
|
||||
subcommand_name
|
||||
),
|
||||
};
|
||||
|
||||
let target_dir = Path::new(
|
||||
subcommand_matches
|
||||
.value_of(TARGET_DIRECTORY_ARG_NAME)
|
||||
.unwrap_or("./"),
|
||||
);
|
||||
|
||||
match subcommand_name {
|
||||
LIST_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
list_archive(archive).await
|
||||
}
|
||||
EXTRACT_SUBCOMMAND => {
|
||||
let archive = match subcommand_matches.value_of(ARCHIVE_ARG_NAME) {
|
||||
Some(archive) => Path::new(archive),
|
||||
None => bail!("No '{}' argument is specified", ARCHIVE_ARG_NAME),
|
||||
};
|
||||
extract_archive(archive, target_dir).await
|
||||
}
|
||||
CREATE_SUBCOMMAND => {
|
||||
let source_dir = match subcommand_matches.value_of(SOURCE_DIRECTORY_ARG_NAME) {
|
||||
Some(source) => Path::new(source),
|
||||
None => bail!("No '{}' argument is specified", SOURCE_DIRECTORY_ARG_NAME),
|
||||
};
|
||||
create_archive(source_dir, target_dir).await
|
||||
}
|
||||
unknown => bail!("Unknown subcommand {}", unknown),
|
||||
}
|
||||
}
|
||||
|
||||
async fn list_archive(archive: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
println!("Listing an archive at path '{}'", archive.display());
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
let archive_bytes = fs::read(&archive)
|
||||
.await
|
||||
.context("Failed to read the archive bytes")?;
|
||||
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_bytes.as_slice())
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
|
||||
let empty_path = Path::new("");
|
||||
println!("-------------------------------");
|
||||
|
||||
let longest_path_in_archive = header
|
||||
.files
|
||||
.iter()
|
||||
.filter_map(|file| Some(file.subpath.as_path(empty_path).to_str()?.len()))
|
||||
.max()
|
||||
.unwrap_or_default()
|
||||
.max(METADATA_FILE_NAME.len());
|
||||
|
||||
for regular_file in &header.files {
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
regular_file.subpath.as_path(empty_path).display(),
|
||||
regular_file.size,
|
||||
width = longest_path_in_archive,
|
||||
)
|
||||
}
|
||||
println!(
|
||||
"File: {:width$} uncompressed size: {} bytes",
|
||||
METADATA_FILE_NAME,
|
||||
header.metadata_file_size,
|
||||
width = longest_path_in_archive,
|
||||
);
|
||||
println!("-------------------------------");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn extract_archive(archive: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let archive = archive.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the archive path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
archive.is_file(),
|
||||
"Path '{}' is not an archive file",
|
||||
archive.display()
|
||||
);
|
||||
let archive_name = match archive.file_name().and_then(|name| name.to_str()) {
|
||||
Some(name) => name,
|
||||
None => bail!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
),
|
||||
};
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
let mut dir_contents = fs::read_dir(&target_dir)
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
let dir_entry = dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to list the target directory contents")?;
|
||||
ensure!(
|
||||
dir_entry.is_none(),
|
||||
"Target directory '{}' is not empty",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Extracting an archive at path '{}' into directory '{}'",
|
||||
archive.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut archive_file = fs::File::open(&archive).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to get the archive name from the path '{}'",
|
||||
archive.display()
|
||||
)
|
||||
})?;
|
||||
let header = compression::read_archive_header(archive_name, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to read the archive header")?;
|
||||
compression::uncompress_with_header(&BTreeSet::new(), &target_dir, header, &mut archive_file)
|
||||
.await
|
||||
.context("Failed to extract the archive")
|
||||
}
|
||||
|
||||
async fn create_archive(source_dir: &Path, target_dir: &Path) -> anyhow::Result<()> {
|
||||
let source_dir = source_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the source dir path '{}'",
|
||||
source_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
source_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
source_dir.display()
|
||||
);
|
||||
|
||||
if !target_dir.exists() {
|
||||
fs::create_dir_all(target_dir).await.with_context(|| {
|
||||
format!(
|
||||
"Failed to create the target dir at path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
}
|
||||
let target_dir = target_dir.canonicalize().with_context(|| {
|
||||
format!(
|
||||
"Failed to get the absolute path for the target dir path '{}'",
|
||||
target_dir.display()
|
||||
)
|
||||
})?;
|
||||
ensure!(
|
||||
target_dir.is_dir(),
|
||||
"Path '{}' is not a directory",
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
println!(
|
||||
"Compressing directory '{}' and creating resulting archive in directory '{}'",
|
||||
source_dir.display(),
|
||||
target_dir.display()
|
||||
);
|
||||
|
||||
let mut metadata_file_contents = None;
|
||||
let mut files_co_archive = Vec::new();
|
||||
|
||||
let mut source_dir_contents = fs::read_dir(&source_dir)
|
||||
.await
|
||||
.context("Failed to read the source directory contents")?;
|
||||
|
||||
while let Some(source_dir_entry) = source_dir_contents
|
||||
.next_entry()
|
||||
.await
|
||||
.context("Failed to read a source dir entry")?
|
||||
{
|
||||
let entry_path = source_dir_entry.path();
|
||||
if entry_path.is_file() {
|
||||
if entry_path.file_name().and_then(|name| name.to_str()) == Some(METADATA_FILE_NAME) {
|
||||
let metadata_bytes = fs::read(entry_path)
|
||||
.await
|
||||
.context("Failed to read metata file bytes in the source dir")?;
|
||||
metadata_file_contents = Some(
|
||||
TimelineMetadata::from_bytes(&metadata_bytes)
|
||||
.context("Failed to parse metata file contents in the source dir")?,
|
||||
);
|
||||
} else {
|
||||
files_co_archive.push(entry_path);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let metadata = match metadata_file_contents {
|
||||
Some(metadata) => metadata,
|
||||
None => bail!(
|
||||
"No metadata file found in the source dir '{}', cannot create the archive",
|
||||
source_dir.display()
|
||||
),
|
||||
};
|
||||
|
||||
let _ = compression::archive_files_as_stream(
|
||||
&source_dir,
|
||||
files_co_archive.iter(),
|
||||
&metadata,
|
||||
move |mut archive_streamer, archive_name| async move {
|
||||
let archive_target = target_dir.join(&archive_name);
|
||||
let mut archive_file = fs::File::create(&archive_target).await?;
|
||||
io::copy(&mut archive_streamer, &mut archive_file).await?;
|
||||
Ok(archive_target)
|
||||
},
|
||||
)
|
||||
.await
|
||||
.context("Failed to create an archive")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
158
pageserver/src/bin/psbench.rs
Normal file
158
pageserver/src/bin/psbench.rs
Normal file
@@ -0,0 +1,158 @@
|
||||
//! Pageserver benchmark tool
|
||||
//!
|
||||
//! Usually it's easier to write python perf tests, but here the performance
|
||||
//! of the tester matters, and the API is easier to work with from rust.
|
||||
use std::{collections::HashSet, io::{BufRead, BufReader, Cursor}, time::Duration};
|
||||
use pageserver::wal_metadata::{Page, WalEntryMetadata};
|
||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use clap::{App, Arg};
|
||||
use std::fs::File;
|
||||
use zenith_utils::{GIT_VERSION, lsn::Lsn, pq_proto::{BeMessage, FeMessage}};
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
const BYTES_IN_PAGE: usize = 8 * 1024;
|
||||
|
||||
pub fn read_lines_buffered(file_name: &str) -> impl Iterator<Item = String> {
|
||||
BufReader::new(File::open(file_name).unwrap())
|
||||
.lines()
|
||||
.map(|result| result.unwrap())
|
||||
}
|
||||
|
||||
pub async fn get_page(
|
||||
pagestream: &mut tokio::net::TcpStream,
|
||||
lsn: &Lsn,
|
||||
page: &Page,
|
||||
latest: bool,
|
||||
) -> anyhow::Result<Vec<u8>> {
|
||||
let latest: u8 = if latest {1} else {0};
|
||||
let msg = {
|
||||
let query = {
|
||||
let mut query = BytesMut::new();
|
||||
query.put_u8(2); // Specifies get_page query
|
||||
query.put_u8(latest);
|
||||
query.put_u64(lsn.0);
|
||||
page.write(&mut query).await?;
|
||||
query.freeze()
|
||||
};
|
||||
|
||||
let mut buf = BytesMut::new();
|
||||
let copy_msg = BeMessage::CopyData(&query);
|
||||
BeMessage::write(&mut buf, ©_msg)?;
|
||||
buf.freeze()
|
||||
};
|
||||
|
||||
pagestream.write(&msg).await?;
|
||||
|
||||
let response = match FeMessage::read_fut(pagestream).await? {
|
||||
Some(FeMessage::CopyData(page)) => page,
|
||||
r => panic!("Expected CopyData message, got: {:?}", r),
|
||||
};
|
||||
|
||||
let page = {
|
||||
let mut cursor = Cursor::new(response);
|
||||
let tag = AsyncReadExt::read_u8(&mut cursor).await?;
|
||||
|
||||
match tag {
|
||||
102 => {
|
||||
let mut page = Vec::<u8>::new();
|
||||
cursor.read_to_end(&mut page).await?;
|
||||
if page.len() != BYTES_IN_PAGE {
|
||||
panic!("Expected 8kb page, got: {:?}", page.len());
|
||||
}
|
||||
page
|
||||
},
|
||||
103 => {
|
||||
let mut bytes = Vec::<u8>::new();
|
||||
cursor.read_to_end(&mut bytes).await?;
|
||||
let message = String::from_utf8(bytes)?;
|
||||
panic!("Got error message: {}", message);
|
||||
},
|
||||
_ => panic!("Unhandled tag {:?}", tag)
|
||||
}
|
||||
};
|
||||
|
||||
Ok(page)
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let arg_matches = App::new("LALALA")
|
||||
.about("lalala")
|
||||
.version(GIT_VERSION)
|
||||
.arg(
|
||||
Arg::new("wal_metadata_file")
|
||||
.help("Path to wal metadata file")
|
||||
.required(true)
|
||||
.index(1),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("tenant_hex")
|
||||
.help("TODO")
|
||||
.required(true)
|
||||
.index(2),
|
||||
)
|
||||
.arg(
|
||||
Arg::new("timeline")
|
||||
.help("TODO")
|
||||
.required(true)
|
||||
.index(3),
|
||||
)
|
||||
.get_matches();
|
||||
|
||||
let metadata_file = arg_matches.value_of("wal_metadata_file").unwrap();
|
||||
let tenant_hex = arg_matches.value_of("tenant_hex").unwrap();
|
||||
let timeline = arg_matches.value_of("timeline").unwrap();
|
||||
|
||||
// Parse log lines
|
||||
let wal_metadata: Vec<WalEntryMetadata> = read_lines_buffered(metadata_file)
|
||||
.map(|line| serde_json::from_str(&line).expect("corrupt metadata file"))
|
||||
.collect();
|
||||
|
||||
// Get raw TCP connection to the pageserver postgres protocol port
|
||||
let mut socket = tokio::net::TcpStream::connect("localhost:15000").await?;
|
||||
let (client, conn) = tokio_postgres::Config::new()
|
||||
.host("127.0.0.1")
|
||||
.port(15000)
|
||||
.dbname("postgres")
|
||||
.user("zenith_admin")
|
||||
.connect_raw(&mut socket, tokio_postgres::NoTls)
|
||||
.await?;
|
||||
|
||||
// Enter pagestream protocol
|
||||
let init_query = format!("pagestream {} {}", tenant_hex, timeline);
|
||||
tokio::select! {
|
||||
_ = conn => panic!("AAAA"),
|
||||
_ = client.query(init_query.as_str(), &[]) => (),
|
||||
};
|
||||
|
||||
// Derive some variables
|
||||
let total_wal_size: usize = wal_metadata.iter().map(|m| m.size).sum();
|
||||
let affected_pages: HashSet<_> = wal_metadata.iter().map(|m| m.affected_pages.clone())
|
||||
.flatten().collect();
|
||||
let latest_lsn = wal_metadata.iter().map(|m| m.lsn).max().unwrap();
|
||||
|
||||
// Get all latest pages
|
||||
let mut durations: Vec<Duration> = vec![];
|
||||
for page in &affected_pages {
|
||||
let start = Instant::now();
|
||||
let _page_bytes = get_page(&mut socket, &latest_lsn, &page, true).await?;
|
||||
let duration = start.elapsed();
|
||||
|
||||
durations.push(duration);
|
||||
}
|
||||
|
||||
durations.sort();
|
||||
// Results are a space separated table of "metric_name value unit", for ease of parsing
|
||||
println!("test_param num_pages {}", affected_pages.len());
|
||||
println!("test_param num_wal_entries {}", wal_metadata.len());
|
||||
println!("test_param total_wal_size {} bytes", total_wal_size);
|
||||
println!("lower_is_better fastest {:?} microseconds", durations.first().unwrap().as_micros());
|
||||
println!("lower_is_better median {:?} microseconds", durations[durations.len() / 2].as_micros());
|
||||
println!("lower_is_better p99 {:?} microseconds", durations[durations.len() - 1 - durations.len() / 100].as_micros());
|
||||
println!("lower_is_better slowest {:?} microseconds", durations.last().unwrap().as_micros());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -6,9 +6,8 @@ use clap::{App, Arg};
|
||||
use pageserver::layered_repository::metadata::TimelineMetadata;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use utils::{lsn::Lsn, project_git_version};
|
||||
|
||||
project_git_version!(GIT_VERSION);
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::GIT_VERSION;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let arg_matches = App::new("Zenith update metadata utility")
|
||||
|
||||
@@ -4,26 +4,22 @@
|
||||
//! file, or on the command line.
|
||||
//! See also `settings.md` for better description on every parameter.
|
||||
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use remote_storage::{RemoteStorageConfig, RemoteStorageKind, S3Config};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use zenith_utils::postgres_backend::AuthType;
|
||||
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};
|
||||
|
||||
use std::convert::TryInto;
|
||||
use std::env;
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use toml_edit;
|
||||
use toml_edit::{Document, Item};
|
||||
use url::Url;
|
||||
use utils::{
|
||||
postgres_backend::AuthType,
|
||||
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
use crate::layered_repository::TIMELINES_SEGMENT_NAME;
|
||||
use crate::tenant_config::{TenantConf, TenantConfOpt};
|
||||
|
||||
pub mod defaults {
|
||||
use crate::tenant_config::defaults::*;
|
||||
use const_format::formatcp;
|
||||
|
||||
pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
|
||||
@@ -31,10 +27,27 @@ pub mod defaults {
|
||||
pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
|
||||
pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
|
||||
|
||||
// FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
|
||||
// would be more appropriate. But a low value forces the code to be exercised more,
|
||||
// which is good for now to trigger bugs.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024;
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024;
|
||||
|
||||
pub const DEFAULT_COMPACTION_PERIOD: &str = "1 s";
|
||||
|
||||
pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;
|
||||
pub const DEFAULT_GC_PERIOD: &str = "100 s";
|
||||
|
||||
pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s";
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "zenith_admin";
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC: usize = 10;
|
||||
pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -49,6 +62,13 @@ pub mod defaults {
|
||||
#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}'
|
||||
#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}'
|
||||
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
|
||||
#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
|
||||
#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'
|
||||
|
||||
@@ -57,17 +77,6 @@ pub mod defaults {
|
||||
# initial superuser role name to use when creating a new tenant
|
||||
#initial_superuser_name = '{DEFAULT_SUPERUSER}'
|
||||
|
||||
# [tenant_config]
|
||||
#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
|
||||
#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes
|
||||
#compaction_period = '{DEFAULT_COMPACTION_PERIOD}'
|
||||
#compaction_threshold = '{DEFAULT_COMPACTION_THRESHOLD}'
|
||||
|
||||
#gc_period = '{DEFAULT_GC_PERIOD}'
|
||||
#gc_horizon = {DEFAULT_GC_HORIZON}
|
||||
#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD}
|
||||
#pitr_interval = '{DEFAULT_PITR_INTERVAL}'
|
||||
|
||||
# [remote_storage]
|
||||
|
||||
"###
|
||||
@@ -78,13 +87,29 @@ pub mod defaults {
|
||||
pub struct PageServerConf {
|
||||
// Identifier of that particular pageserver so e g safekeepers
|
||||
// can safely distinguish different pageservers
|
||||
pub id: NodeId,
|
||||
pub id: ZNodeId,
|
||||
|
||||
/// Example (default): 127.0.0.1:64000
|
||||
pub listen_pg_addr: String,
|
||||
/// Example (default): 127.0.0.1:9898
|
||||
pub listen_http_addr: String,
|
||||
|
||||
// Flush out an inmemory layer, if it's holding WAL older than this
|
||||
// This puts a backstop on how much WAL needs to be re-digested if the
|
||||
// page server crashes.
|
||||
// This parameter actually determines L0 layer file size.
|
||||
pub checkpoint_distance: u64,
|
||||
|
||||
// Target file size, when creating image and delta layers.
|
||||
// This parameter determines L1 layer file size.
|
||||
pub compaction_target_size: u64,
|
||||
|
||||
// How often to check if there's compaction work to be done.
|
||||
pub compaction_period: Duration,
|
||||
|
||||
pub gc_horizon: u64,
|
||||
pub gc_period: Duration,
|
||||
|
||||
// Timeout when waiting for WAL receiver to catch up to an LSN given in a GetPage@LSN call.
|
||||
pub wait_lsn_timeout: Duration,
|
||||
// How long to wait for WAL redo to complete.
|
||||
@@ -109,35 +134,6 @@ pub struct PageServerConf {
|
||||
|
||||
pub auth_validation_public_key_path: Option<PathBuf>,
|
||||
pub remote_storage_config: Option<RemoteStorageConfig>,
|
||||
|
||||
pub profiling: ProfilingConfig,
|
||||
pub default_tenant_conf: TenantConf,
|
||||
|
||||
/// A prefix to add in etcd brokers before every key.
|
||||
/// Can be used for isolating different pageserver groups withing the same etcd cluster.
|
||||
pub broker_etcd_prefix: String,
|
||||
|
||||
/// Etcd broker endpoints to connect to.
|
||||
pub broker_endpoints: Vec<Url>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum ProfilingConfig {
|
||||
Disabled,
|
||||
PageRequests,
|
||||
}
|
||||
|
||||
impl FromStr for ProfilingConfig {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> Result<ProfilingConfig, Self::Err> {
|
||||
let result = match s {
|
||||
"disabled" => ProfilingConfig::Disabled,
|
||||
"page_requests" => ProfilingConfig::PageRequests,
|
||||
_ => bail!("invalid value \"{s}\" for profiling option, valid values are \"disabled\" and \"page_requests\""),
|
||||
};
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
// use dedicated enum for builder to better indicate the intention
|
||||
@@ -162,6 +158,14 @@ struct PageServerConfigBuilder {
|
||||
|
||||
listen_http_addr: BuilderValue<String>,
|
||||
|
||||
checkpoint_distance: BuilderValue<u64>,
|
||||
|
||||
compaction_target_size: BuilderValue<u64>,
|
||||
compaction_period: BuilderValue<Duration>,
|
||||
|
||||
gc_horizon: BuilderValue<u64>,
|
||||
gc_period: BuilderValue<Duration>,
|
||||
|
||||
wait_lsn_timeout: BuilderValue<Duration>,
|
||||
wal_redo_timeout: BuilderValue<Duration>,
|
||||
|
||||
@@ -180,11 +184,7 @@ struct PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
|
||||
remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,
|
||||
|
||||
id: BuilderValue<NodeId>,
|
||||
|
||||
profiling: BuilderValue<ProfilingConfig>,
|
||||
broker_etcd_prefix: BuilderValue<String>,
|
||||
broker_endpoints: BuilderValue<Vec<Url>>,
|
||||
id: BuilderValue<ZNodeId>,
|
||||
}
|
||||
|
||||
impl Default for PageServerConfigBuilder {
|
||||
@@ -194,6 +194,13 @@ impl Default for PageServerConfigBuilder {
|
||||
Self {
|
||||
listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()),
|
||||
listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()),
|
||||
checkpoint_distance: Set(DEFAULT_CHECKPOINT_DISTANCE),
|
||||
compaction_target_size: Set(DEFAULT_COMPACTION_TARGET_SIZE),
|
||||
compaction_period: Set(humantime::parse_duration(DEFAULT_COMPACTION_PERIOD)
|
||||
.expect("cannot parse default compaction period")),
|
||||
gc_horizon: Set(DEFAULT_GC_HORIZON),
|
||||
gc_period: Set(humantime::parse_duration(DEFAULT_GC_PERIOD)
|
||||
.expect("cannot parse default gc period")),
|
||||
wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT)
|
||||
.expect("cannot parse default wait lsn timeout")),
|
||||
wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
@@ -209,9 +216,6 @@ impl Default for PageServerConfigBuilder {
|
||||
auth_validation_public_key_path: Set(None),
|
||||
remote_storage_config: Set(None),
|
||||
id: NotSet,
|
||||
profiling: Set(ProfilingConfig::Disabled),
|
||||
broker_etcd_prefix: Set(etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string()),
|
||||
broker_endpoints: Set(Vec::new()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,6 +229,26 @@ impl PageServerConfigBuilder {
|
||||
self.listen_http_addr = BuilderValue::Set(listen_http_addr)
|
||||
}
|
||||
|
||||
pub fn checkpoint_distance(&mut self, checkpoint_distance: u64) {
|
||||
self.checkpoint_distance = BuilderValue::Set(checkpoint_distance)
|
||||
}
|
||||
|
||||
pub fn compaction_target_size(&mut self, compaction_target_size: u64) {
|
||||
self.compaction_target_size = BuilderValue::Set(compaction_target_size)
|
||||
}
|
||||
|
||||
pub fn compaction_period(&mut self, compaction_period: Duration) {
|
||||
self.compaction_period = BuilderValue::Set(compaction_period)
|
||||
}
|
||||
|
||||
pub fn gc_horizon(&mut self, gc_horizon: u64) {
|
||||
self.gc_horizon = BuilderValue::Set(gc_horizon)
|
||||
}
|
||||
|
||||
pub fn gc_period(&mut self, gc_period: Duration) {
|
||||
self.gc_period = BuilderValue::Set(gc_period)
|
||||
}
|
||||
|
||||
pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) {
|
||||
self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout)
|
||||
}
|
||||
@@ -268,70 +292,117 @@ impl PageServerConfigBuilder {
|
||||
self.remote_storage_config = BuilderValue::Set(remote_storage_config)
|
||||
}
|
||||
|
||||
pub fn broker_endpoints(&mut self, broker_endpoints: Vec<Url>) {
|
||||
self.broker_endpoints = BuilderValue::Set(broker_endpoints)
|
||||
}
|
||||
|
||||
pub fn broker_etcd_prefix(&mut self, broker_etcd_prefix: String) {
|
||||
self.broker_etcd_prefix = BuilderValue::Set(broker_etcd_prefix)
|
||||
}
|
||||
|
||||
pub fn id(&mut self, node_id: NodeId) {
|
||||
pub fn id(&mut self, node_id: ZNodeId) {
|
||||
self.id = BuilderValue::Set(node_id)
|
||||
}
|
||||
|
||||
pub fn profiling(&mut self, profiling: ProfilingConfig) {
|
||||
self.profiling = BuilderValue::Set(profiling)
|
||||
}
|
||||
|
||||
pub fn build(self) -> anyhow::Result<PageServerConf> {
|
||||
let broker_endpoints = self
|
||||
.broker_endpoints
|
||||
.ok_or(anyhow!("No broker endpoints provided"))?;
|
||||
|
||||
pub fn build(self) -> Result<PageServerConf> {
|
||||
Ok(PageServerConf {
|
||||
listen_pg_addr: self
|
||||
.listen_pg_addr
|
||||
.ok_or(anyhow!("missing listen_pg_addr"))?,
|
||||
.ok_or(anyhow::anyhow!("missing listen_pg_addr"))?,
|
||||
listen_http_addr: self
|
||||
.listen_http_addr
|
||||
.ok_or(anyhow!("missing listen_http_addr"))?,
|
||||
.ok_or(anyhow::anyhow!("missing listen_http_addr"))?,
|
||||
checkpoint_distance: self
|
||||
.checkpoint_distance
|
||||
.ok_or(anyhow::anyhow!("missing checkpoint_distance"))?,
|
||||
compaction_target_size: self
|
||||
.compaction_target_size
|
||||
.ok_or(anyhow::anyhow!("missing compaction_target_size"))?,
|
||||
compaction_period: self
|
||||
.compaction_period
|
||||
.ok_or(anyhow::anyhow!("missing compaction_period"))?,
|
||||
gc_horizon: self
|
||||
.gc_horizon
|
||||
.ok_or(anyhow::anyhow!("missing gc_horizon"))?,
|
||||
gc_period: self.gc_period.ok_or(anyhow::anyhow!("missing gc_period"))?,
|
||||
wait_lsn_timeout: self
|
||||
.wait_lsn_timeout
|
||||
.ok_or(anyhow!("missing wait_lsn_timeout"))?,
|
||||
.ok_or(anyhow::anyhow!("missing wait_lsn_timeout"))?,
|
||||
wal_redo_timeout: self
|
||||
.wal_redo_timeout
|
||||
.ok_or(anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow!("missing superuser"))?,
|
||||
.ok_or(anyhow::anyhow!("missing wal_redo_timeout"))?,
|
||||
superuser: self.superuser.ok_or(anyhow::anyhow!("missing superuser"))?,
|
||||
page_cache_size: self
|
||||
.page_cache_size
|
||||
.ok_or(anyhow!("missing page_cache_size"))?,
|
||||
.ok_or(anyhow::anyhow!("missing page_cache_size"))?,
|
||||
max_file_descriptors: self
|
||||
.max_file_descriptors
|
||||
.ok_or(anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow!("missing workdir"))?,
|
||||
.ok_or(anyhow::anyhow!("missing max_file_descriptors"))?,
|
||||
workdir: self.workdir.ok_or(anyhow::anyhow!("missing workdir"))?,
|
||||
pg_distrib_dir: self
|
||||
.pg_distrib_dir
|
||||
.ok_or(anyhow!("missing pg_distrib_dir"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow!("missing auth_type"))?,
|
||||
.ok_or(anyhow::anyhow!("missing pg_distrib_dir"))?,
|
||||
auth_type: self.auth_type.ok_or(anyhow::anyhow!("missing auth_type"))?,
|
||||
auth_validation_public_key_path: self
|
||||
.auth_validation_public_key_path
|
||||
.ok_or(anyhow!("missing auth_validation_public_key_path"))?,
|
||||
.ok_or(anyhow::anyhow!("missing auth_validation_public_key_path"))?,
|
||||
remote_storage_config: self
|
||||
.remote_storage_config
|
||||
.ok_or(anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow!("missing id"))?,
|
||||
profiling: self.profiling.ok_or(anyhow!("missing profiling"))?,
|
||||
// TenantConf is handled separately
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints,
|
||||
broker_etcd_prefix: self
|
||||
.broker_etcd_prefix
|
||||
.ok_or(anyhow!("missing broker_etcd_prefix"))?,
|
||||
.ok_or(anyhow::anyhow!("missing remote_storage_config"))?,
|
||||
id: self.id.ok_or(anyhow::anyhow!("missing id"))?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// External backup storage configuration, enough for creating a client for that storage.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct RemoteStorageConfig {
|
||||
/// Max allowed number of concurrent sync operations between pageserver and the remote storage.
|
||||
pub max_concurrent_sync: NonZeroUsize,
|
||||
/// Max allowed errors before the sync task is considered failed and evicted.
|
||||
pub max_sync_errors: NonZeroU32,
|
||||
/// The storage connection configuration.
|
||||
pub storage: RemoteStorageKind,
|
||||
}
|
||||
|
||||
/// A kind of a remote storage to connect to, with its connection configuration.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub enum RemoteStorageKind {
|
||||
/// Storage based on local file system.
|
||||
/// Specify a root folder to place all stored files into.
|
||||
LocalFs(PathBuf),
|
||||
/// AWS S3 based storage, storing all files in the S3 bucket
|
||||
/// specified by the config
|
||||
AwsS3(S3Config),
|
||||
}
|
||||
|
||||
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
|
||||
#[derive(Clone, PartialEq, Eq)]
|
||||
pub struct S3Config {
|
||||
/// Name of the bucket to connect to.
|
||||
pub bucket_name: String,
|
||||
/// The region where the bucket is located at.
|
||||
pub bucket_region: String,
|
||||
/// A "subfolder" in the bucket, to use the same bucket separately by multiple pageservers at once.
|
||||
pub prefix_in_bucket: Option<String>,
|
||||
/// "Login" to use when connecting to bucket.
|
||||
/// Can be empty for cases like AWS k8s IAM
|
||||
/// where we can allow certain pods to connect
|
||||
/// to the bucket directly without any credentials.
|
||||
pub access_key_id: Option<String>,
|
||||
/// "Password" to use when connecting to bucket.
|
||||
pub secret_access_key: Option<String>,
|
||||
/// A base URL to send S3 requests to.
|
||||
/// By default, the endpoint is derived from a region name, assuming it's
|
||||
/// an AWS S3 region name, erroring on wrong region name.
|
||||
/// Endpoint provides a way to support other S3 flavors and their regions.
|
||||
///
|
||||
/// Example: `http://127.0.0.1:5000`
|
||||
pub endpoint: Option<String>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for S3Config {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("S3Config")
|
||||
.field("bucket_name", &self.bucket_name)
|
||||
.field("bucket_region", &self.bucket_region)
|
||||
.field("prefix_in_bucket", &self.prefix_in_bucket)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServerConf {
|
||||
//
|
||||
// Repository paths, relative to workdir.
|
||||
@@ -369,16 +440,21 @@ impl PageServerConf {
|
||||
/// validating the input and failing on errors.
|
||||
///
|
||||
/// This leaves any options not present in the file in the built-in defaults.
|
||||
pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result<Self> {
|
||||
pub fn parse_and_validate(toml: &Document, workdir: &Path) -> Result<Self> {
|
||||
let mut builder = PageServerConfigBuilder::default();
|
||||
builder.workdir(workdir.to_owned());
|
||||
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
|
||||
for (key, item) in toml.iter() {
|
||||
match key {
|
||||
"listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?),
|
||||
"listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?),
|
||||
"checkpoint_distance" => builder.checkpoint_distance(parse_toml_u64(key, item)?),
|
||||
"compaction_target_size" => {
|
||||
builder.compaction_target_size(parse_toml_u64(key, item)?)
|
||||
}
|
||||
"compaction_period" => builder.compaction_period(parse_toml_duration(key, item)?),
|
||||
"gc_horizon" => builder.gc_horizon(parse_toml_u64(key, item)?),
|
||||
"gc_period" => builder.gc_period(parse_toml_duration(key, item)?),
|
||||
"wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?),
|
||||
"wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?),
|
||||
"initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?),
|
||||
@@ -392,27 +468,12 @@ impl PageServerConf {
|
||||
"auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
|
||||
PathBuf::from(parse_toml_string(key, item)?),
|
||||
)),
|
||||
"auth_type" => builder.auth_type(parse_toml_from_str(key, item)?),
|
||||
"auth_type" => builder.auth_type(parse_toml_auth_type(key, item)?),
|
||||
"remote_storage" => {
|
||||
builder.remote_storage_config(Some(Self::parse_remote_storage_config(item)?))
|
||||
}
|
||||
"tenant_config" => {
|
||||
t_conf = Self::parse_toml_tenant_conf(item)?;
|
||||
}
|
||||
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
|
||||
"profiling" => builder.profiling(parse_toml_from_str(key, item)?),
|
||||
"broker_etcd_prefix" => builder.broker_etcd_prefix(parse_toml_string(key, item)?),
|
||||
"broker_endpoints" => builder.broker_endpoints(
|
||||
parse_toml_array(key, item)?
|
||||
.into_iter()
|
||||
.map(|endpoint_str| {
|
||||
endpoint_str.parse::<Url>().with_context(|| {
|
||||
format!("Array item {endpoint_str} for key {key} is not a valid url endpoint")
|
||||
})
|
||||
})
|
||||
.collect::<anyhow::Result<_>>()?,
|
||||
),
|
||||
_ => bail!("unrecognized pageserver option '{key}'"),
|
||||
"id" => builder.id(ZNodeId(parse_toml_u64(key, item)?)),
|
||||
_ => bail!("unrecognized pageserver option '{}'", key),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -438,75 +499,41 @@ impl PageServerConf {
|
||||
);
|
||||
}
|
||||
|
||||
conf.default_tenant_conf = t_conf.merge(TenantConf::default());
|
||||
|
||||
Ok(conf)
|
||||
}
|
||||
|
||||
// subroutine of parse_and_validate to parse `[tenant_conf]` section
|
||||
|
||||
pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
|
||||
let mut t_conf: TenantConfOpt = Default::default();
|
||||
if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
|
||||
t_conf.checkpoint_distance =
|
||||
Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_target_size) = item.get("compaction_target_size") {
|
||||
t_conf.compaction_target_size = Some(parse_toml_u64(
|
||||
"compaction_target_size",
|
||||
compaction_target_size,
|
||||
)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_period) = item.get("compaction_period") {
|
||||
t_conf.compaction_period =
|
||||
Some(parse_toml_duration("compaction_period", compaction_period)?);
|
||||
}
|
||||
|
||||
if let Some(compaction_threshold) = item.get("compaction_threshold") {
|
||||
t_conf.compaction_threshold =
|
||||
Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
|
||||
}
|
||||
|
||||
if let Some(gc_horizon) = item.get("gc_horizon") {
|
||||
t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
|
||||
}
|
||||
|
||||
if let Some(gc_period) = item.get("gc_period") {
|
||||
t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
|
||||
}
|
||||
|
||||
if let Some(pitr_interval) = item.get("pitr_interval") {
|
||||
t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
|
||||
}
|
||||
|
||||
Ok(t_conf)
|
||||
}
|
||||
|
||||
/// subroutine of parse_config(), to parse the `[remote_storage]` table.
|
||||
fn parse_remote_storage_config(toml: &toml_edit::Item) -> anyhow::Result<RemoteStorageConfig> {
|
||||
let local_path = toml.get("local_path");
|
||||
let bucket_name = toml.get("bucket_name");
|
||||
let bucket_region = toml.get("bucket_region");
|
||||
|
||||
let max_concurrent_syncs = NonZeroUsize::new(
|
||||
parse_optional_integer("max_concurrent_syncs", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
|
||||
)
|
||||
.context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
|
||||
|
||||
let max_sync_errors = NonZeroU32::new(
|
||||
parse_optional_integer("max_sync_errors", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
|
||||
)
|
||||
.context("Failed to parse 'max_sync_errors' as a positive integer")?;
|
||||
|
||||
let concurrency_limit = NonZeroUsize::new(
|
||||
parse_optional_integer("concurrency_limit", toml)?
|
||||
.unwrap_or(remote_storage::DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT),
|
||||
)
|
||||
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
|
||||
let max_concurrent_sync: NonZeroUsize = if let Some(s) = toml.get("max_concurrent_sync") {
|
||||
parse_toml_u64("max_concurrent_sync", s)
|
||||
.and_then(|toml_u64| {
|
||||
toml_u64.try_into().with_context(|| {
|
||||
format!("'max_concurrent_sync' value {} is too large", toml_u64)
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(NonZeroUsize::new)
|
||||
.context("'max_concurrent_sync' must be a non-zero positive integer")?
|
||||
} else {
|
||||
NonZeroUsize::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC).unwrap()
|
||||
};
|
||||
let max_sync_errors: NonZeroU32 = if let Some(s) = toml.get("max_sync_errors") {
|
||||
parse_toml_u64("max_sync_errors", s)
|
||||
.and_then(|toml_u64| {
|
||||
toml_u64.try_into().with_context(|| {
|
||||
format!("'max_sync_errors' value {} is too large", toml_u64)
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
.and_then(NonZeroU32::new)
|
||||
.context("'max_sync_errors' must be a non-zero positive integer")?
|
||||
} else {
|
||||
NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS).unwrap()
|
||||
};
|
||||
|
||||
let storage = match (local_path, bucket_name, bucket_region) {
|
||||
(None, None, None) => bail!("no 'local_path' nor 'bucket_name' option"),
|
||||
@@ -519,6 +546,16 @@ impl PageServerConf {
|
||||
(None, Some(bucket_name), Some(bucket_region)) => RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
|
||||
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
|
||||
access_key_id: toml
|
||||
.get("access_key_id")
|
||||
.map(|access_key_id| parse_toml_string("access_key_id", access_key_id))
|
||||
.transpose()?,
|
||||
secret_access_key: toml
|
||||
.get("secret_access_key")
|
||||
.map(|secret_access_key| {
|
||||
parse_toml_string("secret_access_key", secret_access_key)
|
||||
})
|
||||
.transpose()?,
|
||||
prefix_in_bucket: toml
|
||||
.get("prefix_in_bucket")
|
||||
.map(|prefix_in_bucket| parse_toml_string("prefix_in_bucket", prefix_in_bucket))
|
||||
@@ -527,7 +564,6 @@ impl PageServerConf {
|
||||
.get("endpoint")
|
||||
.map(|endpoint| parse_toml_string("endpoint", endpoint))
|
||||
.transpose()?,
|
||||
concurrency_limit,
|
||||
}),
|
||||
(Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
|
||||
parse_toml_string("local_path", local_path)?,
|
||||
@@ -536,7 +572,7 @@ impl PageServerConf {
|
||||
};
|
||||
|
||||
Ok(RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage,
|
||||
})
|
||||
@@ -544,13 +580,18 @@ impl PageServerConf {
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn test_repo_dir(test_name: &str) -> PathBuf {
|
||||
PathBuf::from(format!("../tmp_check/test_{test_name}"))
|
||||
PathBuf::from(format!("../tmp_check/test_{}", test_name))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn dummy_conf(repo_dir: PathBuf) -> Self {
|
||||
PageServerConf {
|
||||
id: NodeId(0),
|
||||
id: ZNodeId(0),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: 4 * 1024 * 1024,
|
||||
compaction_period: Duration::from_secs(10),
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: Duration::from_secs(10),
|
||||
wait_lsn_timeout: Duration::from_secs(60),
|
||||
wal_redo_timeout: Duration::from_secs(60),
|
||||
page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE,
|
||||
@@ -563,10 +604,6 @@ impl PageServerConf {
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::dummy_conf(),
|
||||
broker_endpoints: Vec::new(),
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -576,7 +613,7 @@ impl PageServerConf {
|
||||
fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
Ok(s.to_string())
|
||||
}
|
||||
|
||||
@@ -585,68 +622,26 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result<u64> {
|
||||
// for our use, though.
|
||||
let i: i64 = item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?;
|
||||
.with_context(|| format!("configure option {} is not an integer", name))?;
|
||||
if i < 0 {
|
||||
bail!("configure option {name} cannot be negative");
|
||||
bail!("configure option {} cannot be negative", name);
|
||||
}
|
||||
Ok(i as u64)
|
||||
}
|
||||
|
||||
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
|
||||
where
|
||||
I: TryFrom<i64, Error = E>,
|
||||
E: std::error::Error + Send + Sync + 'static,
|
||||
{
|
||||
let toml_integer = match item.get(name) {
|
||||
Some(item) => item
|
||||
.as_integer()
|
||||
.with_context(|| format!("configure option {name} is not an integer"))?,
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
I::try_from(toml_integer)
|
||||
.map(Some)
|
||||
.with_context(|| format!("configure option {name} is too large"))
|
||||
}
|
||||
|
||||
fn parse_toml_duration(name: &str, item: &Item) -> Result<Duration> {
|
||||
let s = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
|
||||
Ok(humantime::parse_duration(s)?)
|
||||
}
|
||||
|
||||
fn parse_toml_from_str<T>(name: &str, item: &Item) -> anyhow::Result<T>
|
||||
where
|
||||
T: FromStr,
|
||||
<T as FromStr>::Err: std::fmt::Display,
|
||||
{
|
||||
fn parse_toml_auth_type(name: &str, item: &Item) -> Result<AuthType> {
|
||||
let v = item
|
||||
.as_str()
|
||||
.with_context(|| format!("configure option {name} is not a string"))?;
|
||||
T::from_str(v).map_err(|e| {
|
||||
anyhow!(
|
||||
"Failed to parse string as {parse_type} for configure option {name}: {e}",
|
||||
parse_type = stringify!(T)
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
fn parse_toml_array(name: &str, item: &Item) -> anyhow::Result<Vec<String>> {
|
||||
let array = item
|
||||
.as_array()
|
||||
.with_context(|| format!("configure option {name} is not an array"))?;
|
||||
|
||||
array
|
||||
.iter()
|
||||
.map(|value| {
|
||||
value
|
||||
.as_str()
|
||||
.map(str::to_string)
|
||||
.with_context(|| format!("Array item {value:?} for key {name} is not a string"))
|
||||
})
|
||||
.collect()
|
||||
.with_context(|| format!("configure option {} is not a string", name))?;
|
||||
AuthType::from_str(v)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -663,6 +658,14 @@ mod tests {
|
||||
listen_pg_addr = '127.0.0.1:64000'
|
||||
listen_http_addr = '127.0.0.1:9898'
|
||||
|
||||
checkpoint_distance = 111 # in bytes
|
||||
|
||||
compaction_target_size = 111 # in bytes
|
||||
compaction_period = '111 s'
|
||||
|
||||
gc_period = '222 s'
|
||||
gc_horizon = 222
|
||||
|
||||
wait_lsn_timeout = '111 s'
|
||||
wal_redo_timeout = '111 s'
|
||||
|
||||
@@ -679,23 +682,26 @@ id = 10
|
||||
fn parse_defaults() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
// we have to create dummy values to overcome the validation errors
|
||||
let config_string = format!(
|
||||
"pg_distrib_dir='{}'\nid=10\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
// we have to create dummy pathes to overcome the validation errors
|
||||
let config_string = format!("pg_distrib_dir='{}'\nid=10", pg_distrib_dir.display());
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
|
||||
let parsed_config =
|
||||
PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
parsed_config,
|
||||
PageServerConf {
|
||||
id: NodeId(10),
|
||||
id: ZNodeId(10),
|
||||
listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(),
|
||||
listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(),
|
||||
checkpoint_distance: defaults::DEFAULT_CHECKPOINT_DISTANCE,
|
||||
compaction_target_size: defaults::DEFAULT_COMPACTION_TARGET_SIZE,
|
||||
compaction_period: humantime::parse_duration(defaults::DEFAULT_COMPACTION_PERIOD)?,
|
||||
gc_horizon: defaults::DEFAULT_GC_HORIZON,
|
||||
gc_period: humantime::parse_duration(defaults::DEFAULT_GC_PERIOD)?,
|
||||
wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?,
|
||||
wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?,
|
||||
superuser: defaults::DEFAULT_SUPERUSER.to_string(),
|
||||
@@ -706,12 +712,6 @@ id = 10
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
},
|
||||
"Correct defaults should be used when no config values are provided"
|
||||
);
|
||||
@@ -723,23 +723,30 @@ id = 10
|
||||
fn parse_basic_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
|
||||
let config_string = format!(
|
||||
"{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoints = ['{broker_endpoint}']",
|
||||
"{}pg_distrib_dir='{}'",
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display()
|
||||
);
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
|
||||
let parsed_config =
|
||||
PageServerConf::parse_and_validate(&toml, &workdir).unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
});
|
||||
|
||||
assert_eq!(
|
||||
parsed_config,
|
||||
PageServerConf {
|
||||
id: NodeId(10),
|
||||
id: ZNodeId(10),
|
||||
listen_pg_addr: "127.0.0.1:64000".to_string(),
|
||||
listen_http_addr: "127.0.0.1:9898".to_string(),
|
||||
checkpoint_distance: 111,
|
||||
compaction_target_size: 111,
|
||||
compaction_period: Duration::from_secs(111),
|
||||
gc_horizon: 222,
|
||||
gc_period: Duration::from_secs(222),
|
||||
wait_lsn_timeout: Duration::from_secs(111),
|
||||
wal_redo_timeout: Duration::from_secs(111),
|
||||
superuser: "zzzz".to_string(),
|
||||
@@ -750,12 +757,6 @@ id = 10
|
||||
auth_type: AuthType::Trust,
|
||||
auth_validation_public_key_path: None,
|
||||
remote_storage_config: None,
|
||||
profiling: ProfilingConfig::Disabled,
|
||||
default_tenant_conf: TenantConf::default(),
|
||||
broker_endpoints: vec![broker_endpoint
|
||||
.parse()
|
||||
.expect("Failed to parse a valid broker endpoint URL")],
|
||||
broker_etcd_prefix: etcd_broker::DEFAULT_NEON_BROKER_ETCD_PREFIX.to_string(),
|
||||
},
|
||||
"Should be able to parse all basic config values correctly"
|
||||
);
|
||||
@@ -767,7 +768,6 @@ id = 10
|
||||
fn parse_remote_fs_storage_config() -> anyhow::Result<()> {
|
||||
let tempdir = tempdir()?;
|
||||
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
|
||||
let local_storage_path = tempdir.path().join("local_remote_storage");
|
||||
|
||||
@@ -785,36 +785,37 @@ local_path = '{}'"#,
|
||||
|
||||
for remote_storage_config_str in identical_toml_declarations {
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
r#"{}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
{}"#,
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display(),
|
||||
remote_storage_config_str,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for the local FS");
|
||||
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs: NonZeroUsize::new(
|
||||
remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_sync: NonZeroUsize::new(
|
||||
defaults::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNC
|
||||
)
|
||||
.unwrap(),
|
||||
max_sync_errors: NonZeroU32::new(defaults::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
|
||||
.unwrap(),
|
||||
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
|
||||
},
|
||||
"Remote storage config should correctly parse the local FS config and fill other storage defaults"
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -827,44 +828,47 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
let bucket_name = "some-sample-bucket".to_string();
|
||||
let bucket_region = "eu-north-1".to_string();
|
||||
let prefix_in_bucket = "test_prefix".to_string();
|
||||
let access_key_id = "SOMEKEYAAAAASADSAH*#".to_string();
|
||||
let secret_access_key = "SOMEsEcReTsd292v".to_string();
|
||||
let endpoint = "http://localhost:5000".to_string();
|
||||
let max_concurrent_syncs = NonZeroUsize::new(111).unwrap();
|
||||
let max_concurrent_sync = NonZeroUsize::new(111).unwrap();
|
||||
let max_sync_errors = NonZeroU32::new(222).unwrap();
|
||||
let s3_concurrency_limit = NonZeroUsize::new(333).unwrap();
|
||||
let broker_endpoint = "http://127.0.0.1:7777";
|
||||
|
||||
let identical_toml_declarations = &[
|
||||
format!(
|
||||
r#"[remote_storage]
|
||||
max_concurrent_syncs = {max_concurrent_syncs}
|
||||
max_sync_errors = {max_sync_errors}
|
||||
bucket_name = '{bucket_name}'
|
||||
bucket_region = '{bucket_region}'
|
||||
prefix_in_bucket = '{prefix_in_bucket}'
|
||||
endpoint = '{endpoint}'
|
||||
concurrency_limit = {s3_concurrency_limit}"#
|
||||
max_concurrent_sync = {}
|
||||
max_sync_errors = {}
|
||||
bucket_name = '{}'
|
||||
bucket_region = '{}'
|
||||
prefix_in_bucket = '{}'
|
||||
access_key_id = '{}'
|
||||
secret_access_key = '{}'
|
||||
endpoint = '{}'"#,
|
||||
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
|
||||
),
|
||||
format!(
|
||||
"remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\
|
||||
bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}",
|
||||
"remote_storage={{max_concurrent_sync={}, max_sync_errors={}, bucket_name='{}', bucket_region='{}', prefix_in_bucket='{}', access_key_id='{}', secret_access_key='{}', endpoint='{}'}}",
|
||||
max_concurrent_sync, max_sync_errors, bucket_name, bucket_region, prefix_in_bucket, access_key_id, secret_access_key, endpoint
|
||||
),
|
||||
];
|
||||
|
||||
for remote_storage_config_str in identical_toml_declarations {
|
||||
let config_string = format!(
|
||||
r#"{ALL_BASE_VALUES_TOML}
|
||||
r#"{}
|
||||
pg_distrib_dir='{}'
|
||||
broker_endpoints = ['{broker_endpoint}']
|
||||
|
||||
{remote_storage_config_str}"#,
|
||||
{}"#,
|
||||
ALL_BASE_VALUES_TOML,
|
||||
pg_distrib_dir.display(),
|
||||
remote_storage_config_str,
|
||||
);
|
||||
|
||||
let toml = config_string.parse()?;
|
||||
|
||||
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
|
||||
.unwrap_or_else(|e| {
|
||||
panic!("Failed to parse config '{config_string}', reason: {e:?}")
|
||||
panic!("Failed to parse config '{}', reason: {}", config_string, e)
|
||||
})
|
||||
.remote_storage_config
|
||||
.expect("Should have remote storage config for S3");
|
||||
@@ -872,14 +876,15 @@ broker_endpoints = ['{broker_endpoint}']
|
||||
assert_eq!(
|
||||
parsed_remote_storage_config,
|
||||
RemoteStorageConfig {
|
||||
max_concurrent_syncs,
|
||||
max_concurrent_sync,
|
||||
max_sync_errors,
|
||||
storage: RemoteStorageKind::AwsS3(S3Config {
|
||||
bucket_name: bucket_name.clone(),
|
||||
bucket_region: bucket_region.clone(),
|
||||
access_key_id: Some(access_key_id.clone()),
|
||||
secret_access_key: Some(secret_access_key.clone()),
|
||||
prefix_in_bucket: Some(prefix_in_bucket.clone()),
|
||||
endpoint: Some(endpoint.clone()),
|
||||
concurrency_limit: s3_concurrency_limit,
|
||||
endpoint: Some(endpoint.clone())
|
||||
}),
|
||||
},
|
||||
"Remote storage config should correctly parse the S3 config"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_with::{serde_as, DisplayFromStr};
|
||||
use utils::{
|
||||
use zenith_utils::{
|
||||
lsn::Lsn,
|
||||
zid::{NodeId, ZTenantId, ZTimelineId},
|
||||
zid::{ZNodeId, ZTenantId, ZTimelineId},
|
||||
};
|
||||
|
||||
#[serde_as]
|
||||
@@ -20,19 +20,11 @@ pub struct TimelineCreateRequest {
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize, Default)]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantCreateRequest {
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub new_tenant_id: Option<ZTenantId>,
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
pub pitr_interval: Option<String>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
@@ -42,46 +34,5 @@ pub struct TenantCreateResponse(#[serde_as(as = "DisplayFromStr")] pub ZTenantId
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct StatusResponse {
|
||||
pub id: NodeId,
|
||||
}
|
||||
|
||||
impl TenantCreateRequest {
|
||||
pub fn new(new_tenant_id: Option<ZTenantId>) -> TenantCreateRequest {
|
||||
TenantCreateRequest {
|
||||
new_tenant_id,
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct TenantConfigRequest {
|
||||
pub tenant_id: ZTenantId,
|
||||
#[serde(default)]
|
||||
#[serde_as(as = "Option<DisplayFromStr>")]
|
||||
pub checkpoint_distance: Option<u64>,
|
||||
pub compaction_target_size: Option<u64>,
|
||||
pub compaction_period: Option<String>,
|
||||
pub compaction_threshold: Option<usize>,
|
||||
pub gc_horizon: Option<u64>,
|
||||
pub gc_period: Option<String>,
|
||||
pub image_creation_threshold: Option<usize>,
|
||||
pub pitr_interval: Option<String>,
|
||||
}
|
||||
|
||||
impl TenantConfigRequest {
|
||||
pub fn new(tenant_id: ZTenantId) -> TenantConfigRequest {
|
||||
TenantConfigRequest {
|
||||
tenant_id,
|
||||
checkpoint_distance: None,
|
||||
compaction_target_size: None,
|
||||
compaction_period: None,
|
||||
compaction_threshold: None,
|
||||
gc_horizon: None,
|
||||
gc_period: None,
|
||||
image_creation_threshold: None,
|
||||
pitr_interval: None,
|
||||
}
|
||||
}
|
||||
pub id: ZNodeId,
|
||||
}
|
||||
|
||||
@@ -123,53 +123,6 @@ paths:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/wal_receiver:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
- name: timeline_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
format: hex
|
||||
get:
|
||||
description: Get wal receiver's data attached to the timeline
|
||||
responses:
|
||||
"200":
|
||||
description: WalReceiverEntry
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/WalReceiverEntry"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"404":
|
||||
description: Error when no wal receiver is running or found
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
/v1/tenant/{tenant_id}/timeline/{timeline_id}/attach:
|
||||
parameters:
|
||||
@@ -375,7 +328,11 @@ paths:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantCreateInfo"
|
||||
type: object
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
responses:
|
||||
"201":
|
||||
description: New tenant created successfully
|
||||
@@ -414,48 +371,7 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
/v1/tenant/config:
|
||||
put:
|
||||
description: |
|
||||
Update tenant's config.
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantConfigInfo"
|
||||
responses:
|
||||
"200":
|
||||
description: OK
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: array
|
||||
items:
|
||||
$ref: "#/components/schemas/TenantInfo"
|
||||
"400":
|
||||
description: Malformed tenant config request
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
"401":
|
||||
description: Unauthorized Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/UnauthorizedError"
|
||||
"403":
|
||||
description: Forbidden Error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ForbiddenError"
|
||||
"500":
|
||||
description: Generic operation error
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/Error"
|
||||
|
||||
components:
|
||||
securitySchemes:
|
||||
JWT:
|
||||
@@ -473,45 +389,6 @@ components:
|
||||
type: string
|
||||
state:
|
||||
type: string
|
||||
TenantCreateInfo:
|
||||
type: object
|
||||
properties:
|
||||
new_tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
gc_period:
|
||||
type: string
|
||||
gc_horizon:
|
||||
type: integer
|
||||
pitr_interval:
|
||||
type: string
|
||||
checkpoint_distance:
|
||||
type: integer
|
||||
compaction_period:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
TenantConfigInfo:
|
||||
type: object
|
||||
properties:
|
||||
tenant_id:
|
||||
type: string
|
||||
format: hex
|
||||
gc_period:
|
||||
type: string
|
||||
gc_horizon:
|
||||
type: integer
|
||||
pitr_interval:
|
||||
type: string
|
||||
checkpoint_distance:
|
||||
type: integer
|
||||
compaction_period:
|
||||
type: string
|
||||
compaction_threshold:
|
||||
type: string
|
||||
TimelineInfo:
|
||||
type: object
|
||||
required:
|
||||
@@ -532,7 +409,6 @@ components:
|
||||
type: object
|
||||
required:
|
||||
- awaits_download
|
||||
- remote_consistent_lsn
|
||||
properties:
|
||||
awaits_download:
|
||||
type: boolean
|
||||
@@ -567,21 +443,6 @@ components:
|
||||
type: integer
|
||||
current_logical_size_non_incremental:
|
||||
type: integer
|
||||
WalReceiverEntry:
|
||||
type: object
|
||||
required:
|
||||
- thread_id
|
||||
- wal_producer_connstr
|
||||
properties:
|
||||
thread_id:
|
||||
type: integer
|
||||
wal_producer_connstr:
|
||||
type: string
|
||||
last_received_msg_lsn:
|
||||
type: string
|
||||
format: hex
|
||||
last_received_msg_ts:
|
||||
type: integer
|
||||
|
||||
Error:
|
||||
type: object
|
||||
|
||||
@@ -1,39 +1,36 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use anyhow::Result;
|
||||
use hyper::StatusCode;
|
||||
use hyper::{Body, Request, Response, Uri};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use tracing::*;
|
||||
use zenith_utils::auth::JwtAuth;
|
||||
use zenith_utils::http::endpoint::attach_openapi_ui;
|
||||
use zenith_utils::http::endpoint::auth_middleware;
|
||||
use zenith_utils::http::endpoint::check_permission;
|
||||
use zenith_utils::http::error::ApiError;
|
||||
use zenith_utils::http::{
|
||||
endpoint,
|
||||
error::HttpErrorBody,
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
};
|
||||
use zenith_utils::http::{RequestExt, RouterBuilder};
|
||||
use zenith_utils::zid::{ZTenantTimelineId, ZTimelineId};
|
||||
|
||||
use super::models::{
|
||||
StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse,
|
||||
TimelineCreateRequest,
|
||||
StatusResponse, TenantCreateRequest, TenantCreateResponse, TimelineCreateRequest,
|
||||
};
|
||||
use crate::remote_storage::{schedule_timeline_download, RemoteIndex};
|
||||
use crate::repository::Repository;
|
||||
use crate::storage_sync;
|
||||
use crate::storage_sync::index::{RemoteIndex, RemoteTimeline};
|
||||
use crate::tenant_config::TenantConfOpt;
|
||||
use crate::timelines::{LocalTimelineInfo, RemoteTimelineInfo, TimelineInfo};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines};
|
||||
use utils::{
|
||||
auth::JwtAuth,
|
||||
http::{
|
||||
endpoint::{self, attach_openapi_ui, auth_middleware, check_permission},
|
||||
error::{ApiError, HttpErrorBody},
|
||||
json::{json_request, json_response},
|
||||
request::parse_request_param,
|
||||
RequestExt, RouterBuilder,
|
||||
},
|
||||
zid::{ZTenantId, ZTenantTimelineId, ZTimelineId},
|
||||
};
|
||||
use crate::{config::PageServerConf, tenant_mgr, timelines, ZTenantId};
|
||||
|
||||
struct State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
allowlist_routes: Vec<Uri>,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
}
|
||||
|
||||
impl State {
|
||||
@@ -41,27 +38,17 @@ impl State {
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<Self> {
|
||||
) -> Self {
|
||||
let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
|
||||
.iter()
|
||||
.map(|v| v.parse().unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
// Note that this remote storage is created separately from the main one in the sync_loop.
|
||||
// It's fine since it's stateless and some code duplication saves us from bloating the code around with generics.
|
||||
let remote_storage = conf
|
||||
.remote_storage_config
|
||||
.as_ref()
|
||||
.map(|storage_config| GenericRemoteStorage::new(conf.workdir.clone(), storage_config))
|
||||
.transpose()
|
||||
.context("Failed to init generic remote storage")?;
|
||||
|
||||
Ok(Self {
|
||||
Self {
|
||||
conf,
|
||||
auth,
|
||||
allowlist_routes,
|
||||
remote_index,
|
||||
remote_storage,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,8 +122,8 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
}),
|
||||
})
|
||||
}
|
||||
@@ -166,47 +153,43 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
let include_non_incremental_logical_size = get_include_non_incremental_logical_size(&request);
|
||||
|
||||
let (local_timeline_info, remote_timeline_info) = async {
|
||||
// any error here will render local timeline as None
|
||||
// XXX .in_current_span does not attach messages in spawn_blocking future to current future's span
|
||||
let local_timeline_info = tokio::task::spawn_blocking(move || {
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
Ok::<_, anyhow::Error>(local_timeline)
|
||||
})
|
||||
.await
|
||||
.ok()
|
||||
.and_then(|r| r.ok())
|
||||
.flatten();
|
||||
let span = info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.metadata.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.awaits_download,
|
||||
let (local_timeline_info, span) = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
let local_timeline = {
|
||||
repo.get_timeline(timeline_id)
|
||||
.as_ref()
|
||||
.map(|timeline| {
|
||||
LocalTimelineInfo::from_repo_timeline(
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
timeline,
|
||||
include_non_incremental_logical_size,
|
||||
)
|
||||
})
|
||||
.transpose()?
|
||||
};
|
||||
(local_timeline_info, remote_timeline_info)
|
||||
}
|
||||
.instrument(info_span!("timeline_detail_handler", tenant = %tenant_id, timeline = %timeline_id))
|
||||
.await;
|
||||
Ok::<_, anyhow::Error>((local_timeline, entered.exit()))
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let remote_timeline_info = {
|
||||
let remote_index_read = get_state(&request).remote_index.read().await;
|
||||
remote_index_read
|
||||
.timeline_entry(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.map(|remote_entry| RemoteTimelineInfo {
|
||||
remote_consistent_lsn: remote_entry.disk_consistent_lsn(),
|
||||
awaits_download: remote_entry.get_awaits_download(),
|
||||
})
|
||||
};
|
||||
|
||||
let _enter = span.entered();
|
||||
|
||||
if local_timeline_info.is_none() && remote_timeline_info.is_none() {
|
||||
return Err(ApiError::NotFound(
|
||||
@@ -224,129 +207,44 @@ async fn timeline_detail_handler(request: Request<Body>) -> Result<Response<Body
|
||||
json_response(StatusCode::OK, timeline_info)
|
||||
}
|
||||
|
||||
async fn wal_receiver_get_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
|
||||
let wal_receiver = tokio::task::spawn_blocking(move || {
|
||||
let _enter =
|
||||
info_span!("wal_receiver_get", tenant = %tenant_id, timeline = %timeline_id).entered();
|
||||
|
||||
crate::walreceiver::get_wal_receiver_entry(tenant_id, timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?
|
||||
.ok_or_else(|| {
|
||||
ApiError::NotFound(format!(
|
||||
"WAL receiver not found for tenant {} and timeline {}",
|
||||
tenant_id, timeline_id
|
||||
))
|
||||
})?;
|
||||
|
||||
json_response(StatusCode::OK, wal_receiver)
|
||||
}
|
||||
|
||||
async fn timeline_attach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: ZTenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let timeline_id: ZTimelineId = parse_request_param(&request, "timeline_id")?;
|
||||
info!(
|
||||
"Handling timeline {} attach for tenant: {}",
|
||||
timeline_id, tenant_id,
|
||||
);
|
||||
let span = info_span!("timeline_attach_handler", tenant = %tenant_id, timeline = %timeline_id);
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
if tenant_mgr::get_local_timeline_with_load(tenant_id, timeline_id).is_ok() {
|
||||
let span = tokio::task::spawn_blocking(move || {
|
||||
let entered = span.entered();
|
||||
if tenant_mgr::get_timeline_for_tenant_load(tenant_id, timeline_id).is_ok() {
|
||||
// TODO: maybe answer with 309 Not Modified here?
|
||||
anyhow::bail!("Timeline is already present locally")
|
||||
};
|
||||
Ok(())
|
||||
Ok(entered.exit())
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
let sync_id = ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
};
|
||||
let state = get_state(&request);
|
||||
let remote_index = &state.remote_index;
|
||||
let mut remote_index_write = get_state(&request).remote_index.write().await;
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
if let Some(remote_timeline) = index_accessor.timeline_entry_mut(&sync_id) {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
remote_timeline.awaits_download = true;
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
return json_response(StatusCode::ACCEPTED, ());
|
||||
} else {
|
||||
// no timeline in the index, release the lock to make the potentially lengthy download opetation
|
||||
drop(index_accessor);
|
||||
}
|
||||
|
||||
let new_timeline = match try_download_index_part_data(state, sync_id).await {
|
||||
Ok(Some(mut new_timeline)) => {
|
||||
tokio::fs::create_dir_all(state.conf.timeline_path(&timeline_id, &tenant_id))
|
||||
.await
|
||||
.context("Failed to create new timeline directory")?;
|
||||
new_timeline.awaits_download = true;
|
||||
new_timeline
|
||||
}
|
||||
Ok(None) => return Err(ApiError::NotFound("Unknown remote timeline".to_string())),
|
||||
Err(e) => {
|
||||
error!("Failed to retrieve remote timeline data: {:?}", e);
|
||||
return Err(ApiError::NotFound(
|
||||
"Failed to retrieve remote timeline".to_string(),
|
||||
));
|
||||
}
|
||||
};
|
||||
|
||||
let mut index_accessor = remote_index.write().await;
|
||||
match index_accessor.timeline_entry_mut(&sync_id) {
|
||||
Some(remote_timeline) => {
|
||||
if remote_timeline.awaits_download {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
remote_timeline.awaits_download = true;
|
||||
}
|
||||
None => index_accessor.add_timeline_entry(sync_id, new_timeline),
|
||||
}
|
||||
storage_sync::schedule_layer_download(tenant_id, timeline_id);
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn try_download_index_part_data(
|
||||
state: &State,
|
||||
sync_id: ZTenantTimelineId,
|
||||
) -> anyhow::Result<Option<RemoteTimeline>> {
|
||||
let index_part = match state.remote_storage.as_ref() {
|
||||
Some(GenericRemoteStorage::Local(local_storage)) => {
|
||||
storage_sync::download_index_part(state.conf, local_storage, sync_id).await
|
||||
}
|
||||
Some(GenericRemoteStorage::S3(s3_storage)) => {
|
||||
storage_sync::download_index_part(state.conf, s3_storage, sync_id).await
|
||||
}
|
||||
None => return Ok(None),
|
||||
}
|
||||
.with_context(|| format!("Failed to download index part for timeline {sync_id}"))?;
|
||||
|
||||
let timeline_path = state
|
||||
.conf
|
||||
.timeline_path(&sync_id.timeline_id, &sync_id.tenant_id);
|
||||
RemoteTimeline::from_index_part(&timeline_path, index_part)
|
||||
.map(Some)
|
||||
.with_context(|| {
|
||||
format!("Failed to convert index part into remote timeline for timeline {sync_id}")
|
||||
let _enter = span.entered(); // entered guard cannot live across awaits (non Send)
|
||||
let index_entry = remote_index_write
|
||||
.timeline_entry_mut(&ZTenantTimelineId {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
.ok_or_else(|| ApiError::NotFound("Unknown remote timeline".to_string()))?;
|
||||
|
||||
if index_entry.get_awaits_download() {
|
||||
return Err(ApiError::Conflict(
|
||||
"Timeline download is already in progress".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
index_entry.set_awaits_download(true);
|
||||
schedule_timeline_download(tenant_id, timeline_id);
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
@@ -359,8 +257,8 @@ async fn timeline_detach_handler(request: Request<Body>) -> Result<Response<Body
|
||||
let _enter =
|
||||
info_span!("timeline_detach_handler", tenant = %tenant_id, timeline = %timeline_id)
|
||||
.entered();
|
||||
let state = get_state(&request);
|
||||
tenant_mgr::detach_timeline(state.conf, tenant_id, timeline_id)
|
||||
let repo = tenant_mgr::get_repository_for_tenant(tenant_id)?;
|
||||
repo.detach_timeline(timeline_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -377,7 +275,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
|
||||
crate::tenant_mgr::list_tenants()
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)?;
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
json_response(StatusCode::OK, response_data)
|
||||
}
|
||||
@@ -389,28 +287,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
let request_data: TenantCreateRequest = json_request(&mut request).await?;
|
||||
let remote_index = get_state(&request).remote_index.clone();
|
||||
|
||||
let mut tenant_conf = TenantConfOpt::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
let target_tenant_id = request_data
|
||||
.new_tenant_id
|
||||
.map(ZTenantId::from)
|
||||
@@ -418,9 +294,8 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
|
||||
let new_tenant_id = tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_create", tenant = ?target_tenant_id).entered();
|
||||
let conf = get_config(&request);
|
||||
|
||||
tenant_mgr::create_tenant_repository(conf, tenant_conf, target_tenant_id, remote_index)
|
||||
tenant_mgr::create_tenant_repository(get_config(&request), target_tenant_id, remote_index)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
@@ -431,45 +306,6 @@ async fn tenant_create_handler(mut request: Request<Body>) -> Result<Response<Bo
|
||||
})
|
||||
}
|
||||
|
||||
async fn tenant_config_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
let request_data: TenantConfigRequest = json_request(&mut request).await?;
|
||||
let tenant_id = request_data.tenant_id;
|
||||
// check for management permission
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let mut tenant_conf: TenantConfOpt = Default::default();
|
||||
if let Some(gc_period) = request_data.gc_period {
|
||||
tenant_conf.gc_period =
|
||||
Some(humantime::parse_duration(&gc_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
tenant_conf.gc_horizon = request_data.gc_horizon;
|
||||
tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
|
||||
|
||||
if let Some(pitr_interval) = request_data.pitr_interval {
|
||||
tenant_conf.pitr_interval =
|
||||
Some(humantime::parse_duration(&pitr_interval).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
|
||||
tenant_conf.compaction_target_size = request_data.compaction_target_size;
|
||||
tenant_conf.compaction_threshold = request_data.compaction_threshold;
|
||||
|
||||
if let Some(compaction_period) = request_data.compaction_period {
|
||||
tenant_conf.compaction_period =
|
||||
Some(humantime::parse_duration(&compaction_period).map_err(ApiError::from_err)?);
|
||||
}
|
||||
|
||||
tokio::task::spawn_blocking(move || {
|
||||
let _enter = info_span!("tenant_config", tenant = ?tenant_id).entered();
|
||||
|
||||
tenant_mgr::update_tenant_config(tenant_conf, tenant_id)
|
||||
})
|
||||
.await
|
||||
.map_err(ApiError::from_err)??;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||
json_response(
|
||||
StatusCode::NOT_FOUND,
|
||||
@@ -481,7 +317,7 @@ pub fn make_router(
|
||||
conf: &'static PageServerConf,
|
||||
auth: Option<Arc<JwtAuth>>,
|
||||
remote_index: RemoteIndex,
|
||||
) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
|
||||
) -> RouterBuilder<hyper::Body, ApiError> {
|
||||
let spec = include_bytes!("openapi_spec.yml");
|
||||
let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
|
||||
if auth.is_some() {
|
||||
@@ -495,24 +331,17 @@ pub fn make_router(
|
||||
}))
|
||||
}
|
||||
|
||||
Ok(router
|
||||
.data(Arc::new(
|
||||
State::new(conf, auth, remote_index).context("Failed to initialize router state")?,
|
||||
))
|
||||
router
|
||||
.data(Arc::new(State::new(conf, auth, remote_index)))
|
||||
.get("/v1/status", status_handler)
|
||||
.get("/v1/tenant", tenant_list_handler)
|
||||
.post("/v1/tenant", tenant_create_handler)
|
||||
.put("/v1/tenant/config", tenant_config_handler)
|
||||
.get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
|
||||
.post("/v1/tenant/:tenant_id/timeline", timeline_create_handler)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id",
|
||||
timeline_detail_handler,
|
||||
)
|
||||
.get(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/wal_receiver",
|
||||
wal_receiver_get_handler,
|
||||
)
|
||||
.post(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/attach",
|
||||
timeline_attach_handler,
|
||||
@@ -521,5 +350,5 @@ pub fn make_router(
|
||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/detach",
|
||||
timeline_detach_handler,
|
||||
)
|
||||
.any(handler_404))
|
||||
.any(handler_404)
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@ use postgres_ffi::waldecoder::*;
|
||||
use postgres_ffi::xlog_utils::*;
|
||||
use postgres_ffi::{pg_constants, ControlFileData, DBState_DB_SHUTDOWNED};
|
||||
use postgres_ffi::{Oid, TransactionId};
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Import all relation data pages from local disk into the repository.
|
||||
@@ -274,7 +274,7 @@ fn import_control_file<R: Repository>(
|
||||
|
||||
// Extract the checkpoint record and import it separately.
|
||||
let pg_control = ControlFileData::decode(&buffer)?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode()?;
|
||||
let checkpoint_bytes = pg_control.checkPointCopy.encode();
|
||||
modification.put_checkpoint(checkpoint_bytes)?;
|
||||
|
||||
Ok(pg_control)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -23,7 +23,6 @@ distribution depends on the workload: the updates could be totally random, or
|
||||
there could be a long stream of updates to a single relation when data is bulk
|
||||
loaded, for example, or something in between.
|
||||
|
||||
```
|
||||
Cloud Storage Page Server Safekeeper
|
||||
L1 L0 Memory WAL
|
||||
|
||||
@@ -38,7 +37,6 @@ Cloud Storage Page Server Safekeeper
|
||||
+----+----+ +----+----+ | | |
|
||||
|EEEE| |EEEE|EEEE| +---+-----+
|
||||
+----+ +----+----+
|
||||
```
|
||||
|
||||
In this illustration, WAL is received as a stream from the Safekeeper, from the
|
||||
right. It is immediately captured by the page server and stored quickly in
|
||||
@@ -49,7 +47,7 @@ the same page and relation close to each other.
|
||||
From the page server memory, whenever enough WAL has been accumulated, it is flushed
|
||||
to disk into a new L0 layer file, and the memory is released.
|
||||
|
||||
When enough L0 files have been accumulated, they are merged together and sliced
|
||||
When enough L0 files have been accumulated, they are merged together rand sliced
|
||||
per key-space, producing a new set of files where each file contains a more
|
||||
narrow key range, but larger LSN range.
|
||||
|
||||
@@ -123,7 +121,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
|
||||
a range of LSNs (or a single LSN, in case of image layers). You can think of it
|
||||
as a rectangle in the two-dimensional key-LSN space. The layer files for each
|
||||
timeline are stored in the timeline's subdirectory under
|
||||
`.zenith/tenants/<tenantid>/timelines`.
|
||||
.zenith/tenants/<tenantid>/timelines.
|
||||
|
||||
There are two kind of layer files: images, and delta layers. An image file
|
||||
contains a snapshot of all keys at a particular LSN, whereas a delta file
|
||||
@@ -132,11 +130,8 @@ range of LSN.
|
||||
|
||||
image file:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000070B6-000000067F000032BE0000400000000080B6__00000000346BC568
|
||||
start key end key LSN
|
||||
```
|
||||
|
||||
|
||||
The first parts define the key range that the layer covers. See
|
||||
pgdatadir_mapping.rs for how the key space is used. The last part is the LSN.
|
||||
@@ -145,10 +140,8 @@ delta file:
|
||||
|
||||
Delta files are named similarly, but they cover a range of LSNs:
|
||||
|
||||
```
|
||||
000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
|
||||
start key end key start LSN end LSN
|
||||
```
|
||||
|
||||
A delta file contains all the key-values in the key-range that were updated in
|
||||
the LSN range. If a key has not been modified, there is no trace of it in the
|
||||
@@ -158,9 +151,7 @@ delta layer.
|
||||
A delta layer file can cover a part of the overall key space, as in the previous
|
||||
example, or the whole key range like this:
|
||||
|
||||
```
|
||||
000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__000000578C6B29-0000000057A50051
|
||||
```
|
||||
|
||||
A file that covers the whole key range is called a L0 file (Level 0), while a
|
||||
file that covers only part of the key range is called a L1 file. The "level" of
|
||||
@@ -177,9 +168,7 @@ version, and how branching and GC works is still valid.
|
||||
|
||||
The full path of a delta file looks like this:
|
||||
|
||||
```
|
||||
.zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
|
||||
```
|
||||
|
||||
For simplicity, the examples below use a simplified notation for the
|
||||
paths. The tenant ID is left out, the timeline ID is replaced with
|
||||
@@ -188,10 +177,8 @@ with a human-readable table name. The LSNs are also shorter. For
|
||||
example, a base image file at LSN 100 and a delta file between 100-200
|
||||
for 'orders' table on 'main' branch is represented like this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
```
|
||||
|
||||
|
||||
# Creating layer files
|
||||
@@ -201,14 +188,12 @@ branch called 'main' and two tables, 'orders' and 'customers'. The end
|
||||
of WAL is currently at LSN 250. In this starting situation, you would
|
||||
have these files on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
In addition to those files, the recent changes between LSN 200 and the
|
||||
end of WAL at 250 are kept in memory. If the page server crashes, the
|
||||
@@ -239,7 +224,6 @@ If the customers table is modified later, a new file is created for it
|
||||
at the next checkpoint. The new file will cover the "gap" from the
|
||||
last layer file, so the LSN ranges are always contiguous:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -252,7 +236,6 @@ last layer file, so the LSN ranges are always contiguous:
|
||||
main/customers_200
|
||||
main/customers_200_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
## Reading page versions
|
||||
|
||||
@@ -276,18 +259,15 @@ involves replaying any WAL records applicable to the page between LSNs
|
||||
|
||||
Imagine that a child branch is created at LSN 250:
|
||||
|
||||
```
|
||||
@250
|
||||
----main--+-------------------------->
|
||||
\
|
||||
+---child-------------->
|
||||
```
|
||||
|
||||
|
||||
Then, the 'orders' table is updated differently on the 'main' and
|
||||
'child' branches. You now have this situation on disk:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -302,7 +282,6 @@ Then, the 'orders' table is updated differently on the 'main' and
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
Because the 'customers' table hasn't been modified on the child
|
||||
branch, there is no file for it there. If you request a page for it on
|
||||
@@ -315,7 +294,6 @@ is linear, and the request's LSN identifies unambiguously which file
|
||||
you need to look at. For example, the history for the 'orders' table
|
||||
on the 'main' branch consists of these files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -323,12 +301,10 @@ on the 'main' branch consists of these files:
|
||||
main/orders_300
|
||||
main/orders_300_400
|
||||
main/orders_400
|
||||
```
|
||||
|
||||
And from the 'child' branch's point of view, it consists of these
|
||||
files:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -337,7 +313,6 @@ files:
|
||||
child/orders_300
|
||||
child/orders_300_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
The branch metadata includes the point where the child branch was
|
||||
created, LSN 250. If a page request comes with LSN 275, we read the
|
||||
@@ -370,7 +345,6 @@ Let's look at the single branch scenario again. Imagine that the end
|
||||
of the branch is LSN 525, so that the GC horizon is currently at
|
||||
525-150 = 375
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -383,13 +357,11 @@ of the branch is LSN 525, so that the GC horizon is currently at
|
||||
main/customers_100
|
||||
main/customers_100_200
|
||||
main/customers_200
|
||||
```
|
||||
|
||||
We can remove the following files because the end LSNs of those files are
|
||||
older than GC horizon 375, and there are more recent layer files for the
|
||||
table:
|
||||
|
||||
```
|
||||
main/orders_100 DELETE
|
||||
main/orders_100_200 DELETE
|
||||
main/orders_200 DELETE
|
||||
@@ -402,9 +374,8 @@ table:
|
||||
main/customers_100 DELETE
|
||||
main/customers_100_200 DELETE
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
'main/customers_200' is old enough, but it cannot be
|
||||
'main/customers_100_200' is old enough, but it cannot be
|
||||
removed because there is no newer layer file for the table.
|
||||
|
||||
Things get slightly more complicated with multiple branches. All of
|
||||
@@ -413,7 +384,6 @@ retain older shapshot files that are still needed by child branches.
|
||||
For example, if child branch is created at LSN 150, and the 'customers'
|
||||
table is updated on the branch, you would have these files:
|
||||
|
||||
```
|
||||
main/orders_100 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_100_200 KEEP, NEEDED BY child BRANCH
|
||||
main/orders_200 DELETE
|
||||
@@ -428,7 +398,6 @@ table is updated on the branch, you would have these files:
|
||||
main/customers_200 KEEP, NO NEWER VERSION
|
||||
child/customers_150_300 DELETE
|
||||
child/customers_300 KEEP, NO NEWER VERSION
|
||||
```
|
||||
|
||||
In this situation, 'main/orders_100' and 'main/orders_100_200' cannot
|
||||
be removed, even though they are older than the GC horizon, because
|
||||
@@ -438,7 +407,6 @@ and 'main/orders_200_300' can still be removed.
|
||||
If 'orders' is modified later on the 'child' branch, we will create a
|
||||
new base image and delta file for it on the child:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
|
||||
@@ -451,7 +419,6 @@ new base image and delta file for it on the child:
|
||||
child/customers_300
|
||||
child/orders_150_400
|
||||
child/orders_400
|
||||
```
|
||||
|
||||
After this, the 'main/orders_100' and 'main/orders_100_200' file could
|
||||
be removed. It is no longer needed by the child branch, because there
|
||||
@@ -467,7 +434,6 @@ Describe GC and checkpoint interval settings.
|
||||
In principle, each relation can be checkpointed separately, i.e. the
|
||||
LSN ranges of the files don't need to line up. So this would be legal:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
@@ -480,7 +446,6 @@ LSN ranges of the files don't need to line up. So this would be legal:
|
||||
main/customers_250
|
||||
main/customers_250_500
|
||||
main/customers_500
|
||||
```
|
||||
|
||||
However, the code currently always checkpoints all relations together.
|
||||
So that situation doesn't arise in practice.
|
||||
@@ -503,13 +468,11 @@ does that. It could be useful, however, as a transient state when
|
||||
garbage collecting around branch points, or explicit recovery
|
||||
points. For example, if we start with this:
|
||||
|
||||
```
|
||||
main/orders_100
|
||||
main/orders_100_200
|
||||
main/orders_200
|
||||
main/orders_200_300
|
||||
main/orders_300
|
||||
```
|
||||
|
||||
And there is a branch or explicit recovery point at LSN 150, we could
|
||||
replace 'main/orders_100_200' with 'main/orders_150' to keep a
|
||||
|
||||
@@ -1,20 +1,12 @@
|
||||
//!
|
||||
//! Functions for reading and writing variable-sized "blobs".
|
||||
//!
|
||||
//! Each blob begins with a 1- or 4-byte length field, followed by the
|
||||
//! actual data. If the length is smaller than 128 bytes, the length
|
||||
//! is written as a one byte. If it's larger than that, the length
|
||||
//! is written as a four-byte integer, in big-endian, with the high
|
||||
//! bit set. This way, we can detect whether it's 1- or 4-byte header
|
||||
//! by peeking at the first byte.
|
||||
//!
|
||||
//! len < 128: 0XXXXXXX
|
||||
//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
|
||||
//! Each blob begins with a 4-byte length, followed by the actual data.
|
||||
//!
|
||||
use crate::layered_repository::block_io::{BlockCursor, BlockReader};
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use std::cmp::min;
|
||||
use std::io::{Error, ErrorKind};
|
||||
use std::io::Error;
|
||||
|
||||
/// For reading
|
||||
pub trait BlobCursor {
|
||||
@@ -48,30 +40,21 @@ where
|
||||
|
||||
let mut buf = self.read_blk(blknum)?;
|
||||
|
||||
// peek at the first byte, to determine if it's a 1- or 4-byte length
|
||||
let first_len_byte = buf[off];
|
||||
let len: usize = if first_len_byte < 0x80 {
|
||||
// 1-byte length header
|
||||
off += 1;
|
||||
first_len_byte as usize
|
||||
// read length
|
||||
let mut len_buf = [0u8; 4];
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
// 4-byte length header
|
||||
let mut len_buf = [0u8; 4];
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it is split across two pages
|
||||
len_buf[..thislen].copy_from_slice(&buf[off..PAGE_SZ]);
|
||||
blknum += 1;
|
||||
buf = self.read_blk(blknum)?;
|
||||
len_buf[thislen..].copy_from_slice(&buf[0..4 - thislen]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
len_buf[0] &= 0x7f;
|
||||
u32::from_be_bytes(len_buf) as usize
|
||||
};
|
||||
len_buf.copy_from_slice(&buf[off..off + 4]);
|
||||
off += 4;
|
||||
}
|
||||
let len = u32::from_ne_bytes(len_buf) as usize;
|
||||
|
||||
dstbuf.clear();
|
||||
|
||||
@@ -147,27 +130,10 @@ where
|
||||
{
|
||||
fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, Error> {
|
||||
let offset = self.offset;
|
||||
|
||||
if srcbuf.len() < 128 {
|
||||
// Short blob. Write a 1-byte length header
|
||||
let len_buf = srcbuf.len() as u8;
|
||||
self.inner.write_all(&[len_buf])?;
|
||||
self.offset += 1;
|
||||
} else {
|
||||
// Write a 4-byte length header
|
||||
if srcbuf.len() > 0x7fff_ffff {
|
||||
return Err(Error::new(
|
||||
ErrorKind::Other,
|
||||
format!("blob too large ({} bytes)", srcbuf.len()),
|
||||
));
|
||||
}
|
||||
let mut len_buf = ((srcbuf.len()) as u32).to_be_bytes();
|
||||
len_buf[0] |= 0x80;
|
||||
self.inner.write_all(&len_buf)?;
|
||||
self.offset += 4;
|
||||
}
|
||||
self.inner
|
||||
.write_all(&((srcbuf.len()) as u32).to_ne_bytes())?;
|
||||
self.inner.write_all(srcbuf)?;
|
||||
self.offset += srcbuf.len() as u64;
|
||||
self.offset += 4 + srcbuf.len() as u64;
|
||||
Ok(offset)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,6 +198,7 @@ impl BlockWriter for BlockBuf {
|
||||
assert!(buf.len() == PAGE_SZ);
|
||||
let blknum = self.blocks.len();
|
||||
self.blocks.push(buf);
|
||||
tracing::info!("buffered block {}", blknum);
|
||||
Ok(blknum as u32)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,10 +35,14 @@ use crate::page_cache::{PageReadGuard, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::fs;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::io::{Seek, SeekFrom};
|
||||
@@ -46,13 +50,9 @@ use std::ops::Range;
|
||||
use std::os::unix::fs::FileExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -216,17 +216,12 @@ impl Layer for DeltaLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_value_reconstruct_data(
|
||||
&self,
|
||||
key: Key,
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.lsn_range.start);
|
||||
let mut need_image = true;
|
||||
|
||||
ensure!(self.key_range.contains(&key));
|
||||
@@ -252,9 +247,6 @@ impl Layer for DeltaLayer {
|
||||
return false;
|
||||
}
|
||||
let entry_lsn = DeltaKey::extract_lsn_from_buf(key);
|
||||
if entry_lsn < lsn_range.start {
|
||||
return false;
|
||||
}
|
||||
offsets.push((entry_lsn, blob_ref.pos()));
|
||||
|
||||
!blob_ref.will_init()
|
||||
@@ -263,18 +255,8 @@ impl Layer for DeltaLayer {
|
||||
// Ok, 'offsets' now contains the offsets of all the entries we need to read
|
||||
let mut cursor = file.block_cursor();
|
||||
for (entry_lsn, pos) in offsets {
|
||||
let buf = cursor.read_blob(pos).with_context(|| {
|
||||
format!(
|
||||
"Failed to read blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let val = Value::des(&buf).with_context(|| {
|
||||
format!(
|
||||
"Failed to deserialize file blob from virtual file {}",
|
||||
file.file.path.display()
|
||||
)
|
||||
})?;
|
||||
let buf = cursor.read_blob(pos)?;
|
||||
let val = Value::des(&buf)?;
|
||||
match val {
|
||||
Value::Image(img) => {
|
||||
reconstruct_state.img = Some((entry_lsn, img));
|
||||
@@ -305,10 +287,7 @@ impl Layer for DeltaLayer {
|
||||
}
|
||||
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = anyhow::Result<(Key, Lsn, Value)>> + 'a> {
|
||||
let inner = match self.load() {
|
||||
Ok(inner) => inner,
|
||||
Err(e) => panic!("Failed to load a delta layer: {e:?}"),
|
||||
};
|
||||
let inner = self.load().unwrap();
|
||||
|
||||
match DeltaValueIter::new(inner) {
|
||||
Ok(iter) => Box::new(iter),
|
||||
@@ -363,28 +342,6 @@ impl Layer for DeltaLayer {
|
||||
tree_reader.dump()?;
|
||||
|
||||
let mut cursor = file.block_cursor();
|
||||
|
||||
// A subroutine to dump a single blob
|
||||
let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result<String> {
|
||||
let buf = cursor.read_blob(blob_ref.pos())?;
|
||||
let val = Value::des(&buf)?;
|
||||
let desc = match val {
|
||||
Value::Image(img) => {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(desc)
|
||||
};
|
||||
|
||||
tree_reader.visit(
|
||||
&[0u8; DELTA_KEY_SIZE],
|
||||
VisitDirection::Forwards,
|
||||
@@ -393,10 +350,34 @@ impl Layer for DeltaLayer {
|
||||
let key = DeltaKey::extract_key_from_buf(delta_key);
|
||||
let lsn = DeltaKey::extract_lsn_from_buf(delta_key);
|
||||
|
||||
let desc = match dump_blob(blob_ref) {
|
||||
Ok(desc) => desc,
|
||||
Err(err) => format!("ERROR: {}", err),
|
||||
};
|
||||
let mut desc = String::new();
|
||||
match cursor.read_blob(blob_ref.pos()) {
|
||||
Ok(buf) => {
|
||||
let val = Value::des(&buf);
|
||||
match val {
|
||||
Ok(Value::Image(img)) => {
|
||||
write!(&mut desc, " img {} bytes", img.len()).unwrap();
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
rec.will_init(),
|
||||
wal_desc
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " DESERIALIZATION ERROR: {}", err).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(err) => {
|
||||
write!(&mut desc, " READ ERROR: {}", err).unwrap();
|
||||
}
|
||||
}
|
||||
println!(" key {} at {}: {}", key, lsn, desc);
|
||||
true
|
||||
},
|
||||
@@ -421,28 +402,6 @@ impl DeltaLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
key_start: Key,
|
||||
lsn_range: &Range<Lsn>,
|
||||
) -> PathBuf {
|
||||
let rand_string: String = rand::thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(8)
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timelineid, &tenantid).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.{}.temp",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end),
|
||||
rand_string
|
||||
))
|
||||
}
|
||||
|
||||
///
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
@@ -460,9 +419,7 @@ impl DeltaLayer {
|
||||
drop(inner);
|
||||
let inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(inner).with_context(|| {
|
||||
format!("Failed to load delta layer {}", self.path().display())
|
||||
})?;
|
||||
self.load_inner(inner)?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
@@ -630,8 +587,12 @@ impl DeltaLayerWriter {
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = DeltaLayer::temp_path_for(conf, timelineid, tenantid, key_start, &lsn_range);
|
||||
|
||||
let path = conf.timeline_path(&timelineid, &tenantid).join(format!(
|
||||
"{}-XXX__{:016X}-{:016X}.temp",
|
||||
key_start,
|
||||
u64::from(lsn_range.start),
|
||||
u64::from(lsn_range.end)
|
||||
));
|
||||
let mut file = VirtualFile::create(&path)?;
|
||||
// make room for the header block
|
||||
file.seek(SeekFrom::Start(PAGE_SZ as u64))?;
|
||||
@@ -724,8 +685,6 @@ impl DeltaLayerWriter {
|
||||
}),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()?;
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
//! - page-oriented
|
||||
//!
|
||||
//! TODO:
|
||||
//! - better errors (e.g. with thiserror?)
|
||||
//! - maybe something like an Adaptive Radix Tree would be more efficient?
|
||||
//! - the values stored by image and delta layers are offsets into the file,
|
||||
//! and they are in monotonically increasing order. Prefix compression would
|
||||
@@ -18,12 +19,11 @@
|
||||
//! - An Iterator interface would be more convenient for the callers than the
|
||||
//! 'visit' function
|
||||
//!
|
||||
use anyhow;
|
||||
use byteorder::{ReadBytesExt, BE};
|
||||
use bytes::{BufMut, Bytes, BytesMut};
|
||||
use hex;
|
||||
use std::{cmp::Ordering, io, result};
|
||||
use thiserror::Error;
|
||||
use tracing::error;
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::layered_repository::block_io::{BlockReader, BlockWriter};
|
||||
|
||||
@@ -86,23 +86,6 @@ impl Value {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum DiskBtreeError {
|
||||
#[error("Attempt to append a value that is too large {0} > {}", MAX_VALUE)]
|
||||
AppendOverflow(u64),
|
||||
|
||||
#[error("Unsorted input: key {key:?} is <= last_key {last_key:?}")]
|
||||
UnsortedInput { key: Box<[u8]>, last_key: Box<[u8]> },
|
||||
|
||||
#[error("Could not push to new leaf node")]
|
||||
FailedToPushToNewLeafNode,
|
||||
|
||||
#[error("IoError: {0}")]
|
||||
Io(#[from] io::Error),
|
||||
}
|
||||
|
||||
pub type Result<T> = result::Result<T, DiskBtreeError>;
|
||||
|
||||
/// This is the on-disk representation.
|
||||
struct OnDiskNode<'a, const L: usize> {
|
||||
// Fixed-width fields
|
||||
@@ -123,12 +106,12 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
///
|
||||
/// Interpret a PAGE_SZ page as a node.
|
||||
///
|
||||
fn deparse(buf: &[u8]) -> Result<OnDiskNode<L>> {
|
||||
fn deparse(buf: &[u8]) -> OnDiskNode<L> {
|
||||
let mut cursor = std::io::Cursor::new(buf);
|
||||
let num_children = cursor.read_u16::<BE>()?;
|
||||
let level = cursor.read_u8()?;
|
||||
let prefix_len = cursor.read_u8()?;
|
||||
let suffix_len = cursor.read_u8()?;
|
||||
let num_children = cursor.read_u16::<BE>().unwrap();
|
||||
let level = cursor.read_u8().unwrap();
|
||||
let prefix_len = cursor.read_u8().unwrap();
|
||||
let suffix_len = cursor.read_u8().unwrap();
|
||||
|
||||
let mut off = cursor.position();
|
||||
let prefix_off = off as usize;
|
||||
@@ -146,7 +129,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
let keys = &buf[keys_off..keys_off + keys_len];
|
||||
let values = &buf[values_off..values_off + values_len];
|
||||
|
||||
Ok(OnDiskNode {
|
||||
OnDiskNode {
|
||||
num_children,
|
||||
level,
|
||||
prefix_len,
|
||||
@@ -154,7 +137,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
prefix,
|
||||
keys,
|
||||
values,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -166,11 +149,7 @@ impl<'a, const L: usize> OnDiskNode<'a, L> {
|
||||
Value::from_slice(value_slice)
|
||||
}
|
||||
|
||||
fn binary_search(
|
||||
&self,
|
||||
search_key: &[u8; L],
|
||||
keybuf: &mut [u8],
|
||||
) -> result::Result<usize, usize> {
|
||||
fn binary_search(&self, search_key: &[u8; L], keybuf: &mut [u8]) -> Result<usize, usize> {
|
||||
let mut size = self.num_children as usize;
|
||||
let mut low = 0;
|
||||
let mut high = size;
|
||||
@@ -230,7 +209,7 @@ where
|
||||
///
|
||||
/// Read the value for given key. Returns the value, or None if it doesn't exist.
|
||||
///
|
||||
pub fn get(&self, search_key: &[u8; L]) -> Result<Option<u64>> {
|
||||
pub fn get(&self, search_key: &[u8; L]) -> anyhow::Result<Option<u64>> {
|
||||
let mut result: Option<u64> = None;
|
||||
self.visit(search_key, VisitDirection::Forwards, |key, value| {
|
||||
if key == search_key {
|
||||
@@ -251,7 +230,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
mut visitor: V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
@@ -264,7 +243,7 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
@@ -281,11 +260,11 @@ where
|
||||
search_key: &[u8; L],
|
||||
dir: VisitDirection,
|
||||
visitor: &mut V,
|
||||
) -> Result<bool>
|
||||
) -> anyhow::Result<bool>
|
||||
where
|
||||
V: FnMut(&[u8], u64) -> bool,
|
||||
{
|
||||
let node = OnDiskNode::deparse(node_buf)?;
|
||||
let node = OnDiskNode::deparse(node_buf);
|
||||
let prefix_len = node.prefix_len as usize;
|
||||
let suffix_len = node.suffix_len as usize;
|
||||
|
||||
@@ -390,15 +369,15 @@ where
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
pub fn dump(&self) -> Result<()> {
|
||||
pub fn dump(&self) -> anyhow::Result<()> {
|
||||
self.dump_recurse(self.root_blk, &[], 0)
|
||||
}
|
||||
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> {
|
||||
fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> anyhow::Result<()> {
|
||||
let blk = self.reader.read_blk(self.start_blk + blknum)?;
|
||||
let buf: &[u8] = blk.as_ref();
|
||||
|
||||
let node = OnDiskNode::<L>::deparse(buf)?;
|
||||
let node = OnDiskNode::<L>::deparse(buf);
|
||||
|
||||
print!("{:indent$}", "", indent = depth * 2);
|
||||
println!(
|
||||
@@ -444,13 +423,6 @@ where
|
||||
///
|
||||
/// stack[0] is the current root page, stack.last() is the leaf.
|
||||
///
|
||||
/// We maintain the length of the stack to be always greater than zero.
|
||||
/// Two exceptions are:
|
||||
/// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
|
||||
/// So because other methods cannot see the intermediate state invariant still holds.
|
||||
/// 2. `Self::finish`. It consumes self and does not return it back,
|
||||
/// which means that this is where the structure is destroyed.
|
||||
/// Thus stack of zero length cannot be observed by other methods.
|
||||
stack: Vec<BuildNode<L>>,
|
||||
|
||||
/// Last key that was appended to the tree. Used to sanity check that append
|
||||
@@ -470,29 +442,19 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<()> {
|
||||
if value > MAX_VALUE {
|
||||
return Err(DiskBtreeError::AppendOverflow(value));
|
||||
}
|
||||
pub fn append(&mut self, key: &[u8; L], value: u64) -> Result<(), anyhow::Error> {
|
||||
assert!(value <= MAX_VALUE);
|
||||
if let Some(last_key) = &self.last_key {
|
||||
if key <= last_key {
|
||||
return Err(DiskBtreeError::UnsortedInput {
|
||||
key: key.as_slice().into(),
|
||||
last_key: last_key.as_slice().into(),
|
||||
});
|
||||
}
|
||||
assert!(key > last_key, "unsorted input");
|
||||
}
|
||||
self.last_key = Some(*key);
|
||||
|
||||
self.append_internal(key, Value::from_u64(value))
|
||||
Ok(self.append_internal(key, Value::from_u64(value))?)
|
||||
}
|
||||
|
||||
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<()> {
|
||||
fn append_internal(&mut self, key: &[u8; L], value: Value) -> Result<(), std::io::Error> {
|
||||
// Try to append to the current leaf buffer
|
||||
let last = self
|
||||
.stack
|
||||
.last_mut()
|
||||
.expect("should always have at least one item");
|
||||
let last = self.stack.last_mut().unwrap();
|
||||
let level = last.level;
|
||||
if last.push(key, value) {
|
||||
return Ok(());
|
||||
@@ -514,33 +476,26 @@ where
|
||||
// key to it.
|
||||
let mut last = BuildNode::new(level);
|
||||
if !last.push(key, value) {
|
||||
return Err(DiskBtreeError::FailedToPushToNewLeafNode);
|
||||
panic!("could not push to new leaf node");
|
||||
}
|
||||
|
||||
self.stack.push(last);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Flush the bottommost node in the stack to disk. Appends a downlink to its parent,
|
||||
/// and recursively flushes the parent too, if it becomes full. If the root page becomes full,
|
||||
/// creates a new root page, increasing the height of the tree.
|
||||
fn flush_node(&mut self) -> Result<()> {
|
||||
// Get the current bottommost node in the stack and flush it to disk.
|
||||
let last = self
|
||||
.stack
|
||||
.pop()
|
||||
.expect("should always have at least one item");
|
||||
fn flush_node(&mut self) -> Result<(), std::io::Error> {
|
||||
let last = self.stack.pop().unwrap();
|
||||
let buf = last.pack();
|
||||
let downlink_key = last.first_key();
|
||||
let downlink_ptr = self.writer.write_blk(buf)?;
|
||||
|
||||
// Append the downlink to the parent. If there is no parent, ie. this was the root page,
|
||||
// create a new root page, increasing the height of the tree.
|
||||
// Append the downlink to the parent
|
||||
if self.stack.is_empty() {
|
||||
self.stack.push(BuildNode::new(last.level + 1));
|
||||
}
|
||||
self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))
|
||||
self.append_internal(&downlink_key, Value::from_blknum(downlink_ptr))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
///
|
||||
@@ -550,16 +505,13 @@ where
|
||||
/// (In the image and delta layers, it is stored in the beginning of the file,
|
||||
/// in the summary header)
|
||||
///
|
||||
pub fn finish(mut self) -> Result<(u32, W)> {
|
||||
pub fn finish(mut self) -> Result<(u32, W), std::io::Error> {
|
||||
// flush all levels, except the root.
|
||||
while self.stack.len() > 1 {
|
||||
self.flush_node()?;
|
||||
}
|
||||
|
||||
let root = self
|
||||
.stack
|
||||
.first()
|
||||
.expect("by the check above we left one item there");
|
||||
let root = self.stack.first().unwrap();
|
||||
let buf = root.pack();
|
||||
let root_blknum = self.writer.write_blk(buf)?;
|
||||
|
||||
@@ -740,14 +692,14 @@ mod tests {
|
||||
impl BlockReader for TestDisk {
|
||||
type BlockLease = std::rc::Rc<[u8; PAGE_SZ]>;
|
||||
|
||||
fn read_blk(&self, blknum: u32) -> io::Result<Self::BlockLease> {
|
||||
fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, std::io::Error> {
|
||||
let mut buf = [0u8; PAGE_SZ];
|
||||
buf.copy_from_slice(&self.blocks[blknum as usize]);
|
||||
Ok(std::rc::Rc::new(buf))
|
||||
}
|
||||
}
|
||||
impl BlockWriter for &mut TestDisk {
|
||||
fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
|
||||
fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
|
||||
let blknum = self.blocks.len();
|
||||
self.blocks.push(buf);
|
||||
Ok(blknum as u32)
|
||||
@@ -755,7 +707,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn basic() -> Result<()> {
|
||||
fn basic() -> anyhow::Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk);
|
||||
|
||||
@@ -836,7 +788,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn lots_of_keys() -> Result<()> {
|
||||
fn lots_of_keys() -> anyhow::Result<()> {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk);
|
||||
|
||||
@@ -930,7 +882,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn random_data() -> Result<()> {
|
||||
fn random_data() -> anyhow::Result<()> {
|
||||
// Generate random keys with exponential distribution, to
|
||||
// exercise the prefix compression
|
||||
const NUM_KEYS: usize = 100000;
|
||||
@@ -975,27 +927,21 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "unsorted input")]
|
||||
fn unsorted_input() {
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 2>::new(&mut disk);
|
||||
|
||||
let _ = writer.append(b"ba", 1);
|
||||
let _ = writer.append(b"bb", 2);
|
||||
let err = writer.append(b"aa", 3).expect_err("should've failed");
|
||||
match err {
|
||||
DiskBtreeError::UnsortedInput { key, last_key } => {
|
||||
assert_eq!(key.as_ref(), b"aa".as_slice());
|
||||
assert_eq!(last_key.as_ref(), b"bb".as_slice());
|
||||
}
|
||||
_ => panic!("unexpected error variant, expected DiskBtreeError::UnsortedInput"),
|
||||
}
|
||||
let _ = writer.append(b"aa", 3);
|
||||
}
|
||||
|
||||
///
|
||||
/// This test contains a particular data set, see disk_btree_test_data.rs
|
||||
///
|
||||
#[test]
|
||||
fn particular_data() -> Result<()> {
|
||||
fn particular_data() -> anyhow::Result<()> {
|
||||
// Build a tree from it
|
||||
let mut disk = TestDisk::new();
|
||||
let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk);
|
||||
|
||||
@@ -16,8 +16,8 @@ use std::io::{Error, ErrorKind};
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use tracing::*;
|
||||
use utils::zid::{ZTenantId, ZTimelineId};
|
||||
use zenith_utils::zid::ZTenantId;
|
||||
use zenith_utils::zid::ZTimelineId;
|
||||
|
||||
use std::os::unix::fs::FileExt;
|
||||
|
||||
@@ -199,24 +199,18 @@ impl BlobWriter for EphemeralFile {
|
||||
let mut buf = self.get_buf_for_write(blknum)?;
|
||||
|
||||
// Write the length field
|
||||
if srcbuf.len() < 0x80 {
|
||||
buf[off] = srcbuf.len() as u8;
|
||||
off += 1;
|
||||
let len_buf = u32::to_ne_bytes(srcbuf.len() as u32);
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
let mut len_buf = u32::to_be_bytes(srcbuf.len() as u32);
|
||||
len_buf[0] |= 0x80;
|
||||
let thislen = PAGE_SZ - off;
|
||||
if thislen < 4 {
|
||||
// it needs to be split across pages
|
||||
buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
|
||||
blknum += 1;
|
||||
buf = self.get_buf_for_write(blknum)?;
|
||||
buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
|
||||
off = 4 - thislen;
|
||||
} else {
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
buf[off..off + 4].copy_from_slice(&len_buf);
|
||||
off += 4;
|
||||
}
|
||||
|
||||
// Write the payload
|
||||
@@ -235,13 +229,7 @@ impl BlobWriter for EphemeralFile {
|
||||
buf_remain = &buf_remain[this_blk_len..];
|
||||
}
|
||||
drop(buf);
|
||||
|
||||
if srcbuf.len() < 0x80 {
|
||||
self.size += 1;
|
||||
} else {
|
||||
self.size += 4;
|
||||
}
|
||||
self.size += srcbuf.len() as u64;
|
||||
self.size += 4 + srcbuf.len() as u64;
|
||||
|
||||
Ok(pos)
|
||||
}
|
||||
@@ -256,31 +244,16 @@ impl Drop for EphemeralFile {
|
||||
// remove entry from the hash map
|
||||
EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);
|
||||
|
||||
// unlink the file
|
||||
let res = std::fs::remove_file(&self.file.path);
|
||||
if let Err(e) = res {
|
||||
warn!(
|
||||
"could not remove ephemeral file '{}': {}",
|
||||
self.file.path.display(),
|
||||
e
|
||||
);
|
||||
}
|
||||
// unlink file
|
||||
// FIXME: print error
|
||||
let _ = std::fs::remove_file(&self.file.path);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), std::io::Error> {
|
||||
if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
|
||||
match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
|
||||
Ok(_) => Ok(()),
|
||||
Err(e) => Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
format!(
|
||||
"failed to write back to ephemeral file at {} error: {}",
|
||||
file.path.display(),
|
||||
e
|
||||
),
|
||||
)),
|
||||
}
|
||||
file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64)?;
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::new(
|
||||
ErrorKind::Other,
|
||||
@@ -399,12 +372,6 @@ mod tests {
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
// also test with a large blobs
|
||||
for i in 0..100 {
|
||||
let data = format!("blob{}", i).as_bytes().repeat(100);
|
||||
let pos = file.write_blob(&data)?;
|
||||
blobs.push((pos, data));
|
||||
}
|
||||
|
||||
let mut cursor = BlockCursor::new(&file);
|
||||
for (pos, expected) in blobs {
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
// Note: LayeredTimeline::load_layer_map() relies on this sort order
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
|
||||
@@ -30,11 +30,12 @@ use crate::layered_repository::storage_layer::{
|
||||
use crate::page_cache::PAGE_SZ;
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::virtual_file::VirtualFile;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{bail, ensure, Context, Result};
|
||||
use bytes::Bytes;
|
||||
use hex;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use log::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::fs;
|
||||
use std::io::Write;
|
||||
@@ -42,13 +43,9 @@ use std::io::{Seek, SeekFrom};
|
||||
use std::ops::Range;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::{RwLock, RwLockReadGuard};
|
||||
use tracing::*;
|
||||
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
///
|
||||
/// Header stored in the beginning of the file
|
||||
@@ -126,10 +123,6 @@ impl Layer for ImageLayer {
|
||||
PathBuf::from(self.layer_name().to_string())
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
Some(self.path())
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
@@ -155,7 +148,6 @@ impl Layer for ImageLayer {
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
assert!(self.key_range.contains(&key));
|
||||
assert!(lsn_range.start >= self.lsn);
|
||||
assert!(lsn_range.end >= self.lsn);
|
||||
|
||||
let inner = self.load()?;
|
||||
@@ -242,22 +234,6 @@ impl ImageLayer {
|
||||
}
|
||||
}
|
||||
|
||||
fn temp_path_for(
|
||||
conf: &PageServerConf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
fname: &ImageFileName,
|
||||
) -> PathBuf {
|
||||
let rand_string: String = rand::thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
.take(8)
|
||||
.map(char::from)
|
||||
.collect();
|
||||
|
||||
conf.timeline_path(&timelineid, &tenantid)
|
||||
.join(format!("{}.{}.temp", fname, rand_string))
|
||||
}
|
||||
|
||||
///
|
||||
/// Open the underlying file and read the metadata into memory, if it's
|
||||
/// not loaded already.
|
||||
@@ -275,9 +251,7 @@ impl ImageLayer {
|
||||
drop(inner);
|
||||
let mut inner = self.inner.write().unwrap();
|
||||
if !inner.loaded {
|
||||
self.load_inner(&mut inner).with_context(|| {
|
||||
format!("Failed to load image layer {}", self.path().display())
|
||||
})?
|
||||
self.load_inner(&mut inner)?;
|
||||
} else {
|
||||
// Another thread loaded it while we were not holding the lock.
|
||||
}
|
||||
@@ -415,7 +389,7 @@ impl ImageLayer {
|
||||
///
|
||||
pub struct ImageLayerWriter {
|
||||
conf: &'static PageServerConf,
|
||||
path: PathBuf,
|
||||
_path: PathBuf,
|
||||
timelineid: ZTimelineId,
|
||||
tenantid: ZTenantId,
|
||||
key_range: Range<Key>,
|
||||
@@ -433,10 +407,12 @@ impl ImageLayerWriter {
|
||||
key_range: &Range<Key>,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<ImageLayerWriter> {
|
||||
// Create the file initially with a temporary filename.
|
||||
// We'll atomically rename it to the final name when we're done.
|
||||
let path = ImageLayer::temp_path_for(
|
||||
conf,
|
||||
// Create the file
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(conf),
|
||||
timelineid,
|
||||
tenantid,
|
||||
&ImageFileName {
|
||||
@@ -456,7 +432,7 @@ impl ImageLayerWriter {
|
||||
|
||||
let writer = ImageLayerWriter {
|
||||
conf,
|
||||
path,
|
||||
_path: path,
|
||||
timelineid,
|
||||
tenantid,
|
||||
key_range: key_range.clone(),
|
||||
@@ -527,25 +503,6 @@ impl ImageLayerWriter {
|
||||
index_root_blk,
|
||||
}),
|
||||
};
|
||||
|
||||
// fsync the file
|
||||
file.sync_all()?;
|
||||
|
||||
// Rename the file to its final name
|
||||
//
|
||||
// Note: This overwrites any existing file. There shouldn't be any.
|
||||
// FIXME: throw an error instead?
|
||||
let final_path = ImageLayer::path_for(
|
||||
&PathOrConf::Conf(self.conf),
|
||||
self.timelineid,
|
||||
self.tenantid,
|
||||
&ImageFileName {
|
||||
key_range: self.key_range.clone(),
|
||||
lsn: self.lsn,
|
||||
},
|
||||
);
|
||||
std::fs::rename(self.path, &final_path)?;
|
||||
|
||||
trace!("created image layer {}", layer.path().display());
|
||||
|
||||
Ok(layer)
|
||||
|
||||
@@ -14,21 +14,19 @@ use crate::layered_repository::storage_layer::{
|
||||
};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord;
|
||||
use crate::{ZTenantId, ZTimelineId};
|
||||
use anyhow::{bail, ensure, Result};
|
||||
use log::*;
|
||||
use std::collections::HashMap;
|
||||
use tracing::*;
|
||||
use utils::{
|
||||
bin_ser::BeSer,
|
||||
lsn::Lsn,
|
||||
vec_map::VecMap,
|
||||
zid::{ZTenantId, ZTimelineId},
|
||||
};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use std::fmt::Write as _;
|
||||
use std::ops::Range;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use zenith_utils::bin_ser::BeSer;
|
||||
use zenith_utils::lsn::Lsn;
|
||||
use zenith_utils::vec_map::VecMap;
|
||||
|
||||
pub struct InMemoryLayer {
|
||||
conf: &'static PageServerConf,
|
||||
@@ -85,10 +83,6 @@ impl Layer for InMemoryLayer {
|
||||
))
|
||||
}
|
||||
|
||||
fn local_path(&self) -> Option<PathBuf> {
|
||||
None
|
||||
}
|
||||
|
||||
fn get_tenant_id(&self) -> ZTenantId {
|
||||
self.tenantid
|
||||
}
|
||||
@@ -119,7 +113,7 @@ impl Layer for InMemoryLayer {
|
||||
lsn_range: Range<Lsn>,
|
||||
reconstruct_state: &mut ValueReconstructState,
|
||||
) -> anyhow::Result<ValueReconstructResult> {
|
||||
ensure!(lsn_range.start >= self.start_lsn);
|
||||
ensure!(lsn_range.start <= self.start_lsn);
|
||||
let mut need_image = true;
|
||||
|
||||
let inner = self.inner.read().unwrap();
|
||||
@@ -130,6 +124,13 @@ impl Layer for InMemoryLayer {
|
||||
if let Some(vec_map) = inner.index.get(&key) {
|
||||
let slice = vec_map.slice_range(lsn_range);
|
||||
for (entry_lsn, pos) in slice.iter().rev() {
|
||||
match &reconstruct_state.img {
|
||||
Some((cached_lsn, _)) if entry_lsn <= cached_lsn => {
|
||||
return Ok(ValueReconstructResult::Complete)
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let buf = reader.read_blob(*pos)?;
|
||||
let value = Value::des(&buf)?;
|
||||
match value {
|
||||
@@ -211,7 +212,7 @@ impl Layer for InMemoryLayer {
|
||||
write!(&mut desc, " img {} bytes", img.len())?;
|
||||
}
|
||||
Ok(Value::WalRecord(rec)) => {
|
||||
let wal_desc = walrecord::describe_wal_record(&rec).unwrap();
|
||||
let wal_desc = walrecord::describe_wal_record(&rec);
|
||||
write!(
|
||||
&mut desc,
|
||||
" rec {} bytes will_init: {} {}",
|
||||
|
||||
@@ -16,12 +16,12 @@ use crate::layered_repository::InMemoryLayer;
|
||||
use crate::repository::Key;
|
||||
use anyhow::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use metrics::{register_int_gauge, IntGauge};
|
||||
use std::collections::VecDeque;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use zenith_metrics::{register_int_gauge, IntGauge};
|
||||
use zenith_utils::lsn::Lsn;
|
||||
|
||||
lazy_static! {
|
||||
static ref NUM_ONDISK_LAYERS: IntGauge =
|
||||
@@ -43,13 +43,10 @@ pub struct LayerMap {
|
||||
pub next_open_layer_at: Option<Lsn>,
|
||||
|
||||
///
|
||||
/// Frozen layers, if any. Frozen layers are in-memory layers that
|
||||
/// are no longer added to, but haven't been written out to disk
|
||||
/// yet. They contain WAL older than the current 'open_layer' or
|
||||
/// 'next_open_layer_at', but newer than any historic layer.
|
||||
/// The frozen layers are in order from oldest to newest, so that
|
||||
/// the newest one is in the 'back' of the VecDeque, and the oldest
|
||||
/// in the 'front'.
|
||||
/// The frozen layer, if any, contains WAL older than the current 'open_layer'
|
||||
/// or 'next_open_layer_at', but newer than any historic layer. The frozen
|
||||
/// layer is during checkpointing, when an InMemoryLayer is being written out
|
||||
/// to disk.
|
||||
///
|
||||
pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
|
||||
|
||||
@@ -132,15 +129,17 @@ impl LayerMap {
|
||||
// this layer contains the requested point in the key/lsn space.
|
||||
// No need to search any further
|
||||
trace!(
|
||||
"found layer {} for request on {key} at {end_lsn}",
|
||||
"found layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
break;
|
||||
}
|
||||
// this layer's end LSN is smaller than the requested point. If there's
|
||||
// nothing newer, this is what we need to return. Remember this.
|
||||
if let Some(old_candidate) = &latest_delta {
|
||||
if let Some(ref old_candidate) = latest_delta {
|
||||
if l.get_lsn_range().end > old_candidate.get_lsn_range().end {
|
||||
latest_delta.replace(Arc::clone(l));
|
||||
}
|
||||
@@ -150,8 +149,10 @@ impl LayerMap {
|
||||
}
|
||||
if let Some(l) = latest_delta {
|
||||
trace!(
|
||||
"found (old) layer {} for request on {key} at {end_lsn}",
|
||||
"found (old) layer {} for request on {} at {}",
|
||||
l.filename().display(),
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
let lsn_floor = std::cmp::max(
|
||||
Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1),
|
||||
@@ -162,13 +163,17 @@ impl LayerMap {
|
||||
layer: l,
|
||||
}))
|
||||
} else if let Some(l) = latest_img {
|
||||
trace!("found img layer and no deltas for request on {key} at {end_lsn}");
|
||||
trace!(
|
||||
"found img layer and no deltas for request on {} at {}",
|
||||
key,
|
||||
end_lsn
|
||||
);
|
||||
Ok(Some(SearchResult {
|
||||
lsn_floor: latest_img_lsn.unwrap(),
|
||||
layer: l,
|
||||
}))
|
||||
} else {
|
||||
trace!("no layer found for request on {key} at {end_lsn}");
|
||||
trace!("no layer found for request on {} at {}", key, end_lsn);
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@@ -186,6 +191,7 @@ impl LayerMap {
|
||||
///
|
||||
/// This should be called when the corresponding file on disk has been deleted.
|
||||
///
|
||||
#[allow(dead_code)]
|
||||
pub fn remove_historic(&mut self, layer: Arc<dyn Layer>) {
|
||||
let len_before = self.historic_layers.len();
|
||||
|
||||
@@ -201,14 +207,18 @@ impl LayerMap {
|
||||
NUM_ONDISK_LAYERS.dec();
|
||||
}
|
||||
|
||||
/// Is there a newer image layer for given key- and LSN-range?
|
||||
/// Is there a newer image layer for given key-range?
|
||||
///
|
||||
/// This is used for garbage collection, to determine if an old layer can
|
||||
/// be deleted.
|
||||
pub fn image_layer_exists(
|
||||
/// We ignore layers newer than disk_consistent_lsn because they will be removed at restart
|
||||
/// We also only look at historic layers
|
||||
//#[allow(dead_code)]
|
||||
pub fn newer_image_layer_exists(
|
||||
&self,
|
||||
key_range: &Range<Key>,
|
||||
lsn_range: &Range<Lsn>,
|
||||
lsn: Lsn,
|
||||
disk_consistent_lsn: Lsn,
|
||||
) -> Result<bool> {
|
||||
let mut range_remain = key_range.clone();
|
||||
|
||||
@@ -221,7 +231,8 @@ impl LayerMap {
|
||||
let img_lsn = l.get_lsn_range().start;
|
||||
if !l.is_incremental()
|
||||
&& l.get_key_range().contains(&range_remain.start)
|
||||
&& lsn_range.contains(&img_lsn)
|
||||
&& img_lsn > lsn
|
||||
&& img_lsn < disk_consistent_lsn
|
||||
{
|
||||
made_progress = true;
|
||||
let img_key_end = l.get_key_range().end;
|
||||
@@ -239,7 +250,7 @@ impl LayerMap {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn iter_historic_layers(&self) -> impl Iterator<Item = &Arc<dyn Layer>> {
|
||||
pub fn iter_historic_layers(&self) -> std::slice::Iter<Arc<dyn Layer>> {
|
||||
self.historic_layers.iter()
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user