Compare commits

..

9 Commits

Author SHA1 Message Date
Konstantin Knizhnik
0a049ae17a Not working version 2022-05-05 08:53:21 +03:00
Konstantin Knizhnik
32557b16b4 Use prepared dictionary for layer reconstruction 2022-05-04 18:17:33 +03:00
Konstantin Knizhnik
076b8e3d04 Use zstd::bulk::Decompressor::decompress instead decompredd_to_buffer 2022-05-03 11:28:32 +03:00
Konstantin Knizhnik
39eadf6236 Use zstd::bulk::Decompressor to decode WAL records to minimize number of context initalization 2022-05-03 09:59:33 +03:00
Heikki Linnakangas
4472d49c1e Reuse the zstd Compressor context when building delta layer. 2022-05-03 01:47:39 +03:00
Konstantin Knizhnik
dc057ace2f Fix formatting 2022-05-02 07:58:07 +03:00
Konstantin Knizhnik
0e49d748b8 Fix bug in dictinary creation 2022-05-02 07:58:07 +03:00
Konstantin Knizhnik
fc7d1ba043 Do not compress delta layers if there are too few elements 2022-05-02 07:58:07 +03:00
Konstantin Knizhnik
e28b3dee37 Implement compression of image and delta layers 2022-05-02 07:58:07 +03:00
376 changed files with 14263 additions and 30896 deletions

View File

@@ -1,13 +0,0 @@
# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
# optimizations enabled by "opt-level=1" don't affect debuggability too much.
#
# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
#
[profile.dev.package."*"]
# Set the default for dependencies in Development mode.
opt-level = 3
[profile.dev]
# Turn on a small amount of optimization in Development mode.
opt-level = 1

View File

@@ -1,4 +1,2 @@
zenith_install.tar.gz
.zenith_current_version
neon_install.tar.gz
.neon_current_version

View File

@@ -6,7 +6,5 @@ timeout = 30
[ssh_connection]
ssh_args = -F ./ansible.ssh.cfg
# teleport doesn't support sftp yet https://github.com/gravitational/teleport/issues/7127
# and scp neither worked for me
transfer_method = piped
scp_if_ssh = True
pipelining = True

View File

@@ -1,7 +1,3 @@
# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
# (use pre 8.5 option name to cope with old ssh in CI)
PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
Host tele.zenith.tech
User admin
Port 3023

View File

@@ -57,7 +57,7 @@
args:
creates: "/storage/pageserver/data/tenants"
environment:
NEON_REPO_DIR: "/storage/pageserver/data"
ZENITH_REPO_DIR: "/storage/pageserver/data"
LD_LIBRARY_PATH: "/usr/local/lib"
become: true
tags:
@@ -131,7 +131,7 @@
args:
creates: "/storage/safekeeper/data/safekeeper.id"
environment:
NEON_REPO_DIR: "/storage/safekeeper/data"
ZENITH_REPO_DIR: "/storage/safekeeper/data"
LD_LIBRARY_PATH: "/usr/local/lib"
become: true
tags:

View File

@@ -6,8 +6,8 @@ RELEASE=${RELEASE:-false}
# look at docker hub for latest tag for neon docker image
if [ "${RELEASE}" = "true" ]; then
echo "search latest release tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | grep -E '^[0-9]+$' | sort -n | tail -1)
echo "search latest relase tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep release | sed 's/release-//g' | tail -1)
if [ -z "${VERSION}" ]; then
echo "no any docker tags found, exiting..."
exit 1
@@ -16,7 +16,7 @@ if [ "${RELEASE}" = "true" ]; then
fi
else
echo "search latest dev tag"
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -E '^[0-9]+$' | sort -n | tail -1)
VERSION=$(curl -s https://registry.hub.docker.com/v1/repositories/neondatabase/neon/tags |jq -r -S '.[].name' | grep -v release | tail -1)
if [ -z "${VERSION}" ]; then
echo "no any docker tags found, exiting..."
exit 1
@@ -31,7 +31,7 @@ echo "found ${VERSION}"
rm -rf neon_install postgres_install.tar.gz neon_install.tar.gz .neon_current_version
mkdir neon_install
# retrieve binaries from docker image
# retrive binaries from docker image
echo "getting binaries from docker image"
docker pull --quiet neondatabase/neon:${TAG}
ID=$(docker create neondatabase/neon:${TAG})

View File

@@ -1,7 +1,5 @@
[pageservers]
#zenith-1-ps-1 console_region_id=1
zenith-1-ps-2 console_region_id=1
zenith-1-ps-3 console_region_id=1
zenith-1-ps-1 console_region_id=1
[safekeepers]
zenith-1-sk-1 console_region_id=1
@@ -13,8 +11,7 @@ pageservers
safekeepers
[storage:vars]
env_name = prod-1
console_mgmt_base_url = http://console-release.local
bucket_name = zenith-storage-oregon
bucket_region = us-west-2
etcd_endpoints = zenith-1-etcd.local:2379
etcd_endpoints = etcd-release.local:2379

View File

@@ -12,9 +12,10 @@ cat <<EOF | tee /tmp/payload
"version": 1,
"host": "${HOST}",
"port": 6500,
"http_port": 7676,
"region_id": {{ console_region_id }},
"instance_id": "${INSTANCE_ID}"
"instance_id": "${INSTANCE_ID}",
"http_host": "${HOST}",
"http_port": 7676
}
EOF

View File

@@ -1,20 +1,18 @@
[pageservers]
#zenith-us-stage-ps-1 console_region_id=27
zenith-us-stage-ps-2 console_region_id=27
zenith-us-stage-ps-3 console_region_id=27
[safekeepers]
zenith-us-stage-sk-1 console_region_id=27
zenith-us-stage-sk-2 console_region_id=27
zenith-us-stage-sk-4 console_region_id=27
zenith-us-stage-sk-5 console_region_id=27
zenith-us-stage-sk-6 console_region_id=27
[storage:children]
pageservers
safekeepers
[storage:vars]
env_name = us-stage
console_mgmt_base_url = http://console-staging.local
bucket_name = zenith-staging-storage-us-east-1
bucket_region = us-east-1
etcd_endpoints = zenith-us-stage-etcd.local:2379
etcd_endpoints = etcd-staging.local:2379

View File

@@ -5,8 +5,8 @@ After=network.target auditd.service
[Service]
Type=simple
User=pageserver
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /storage/pageserver/data
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT

View File

@@ -0,0 +1,18 @@
[Unit]
Description=Zenith safekeeper
After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }}
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
Restart=on-failure
TimeoutSec=10
LimitNOFILE=30000000
[Install]
WantedBy=multi-user.target

824
.circleci/config.yml Normal file
View File

@@ -0,0 +1,824 @@
version: 2.1
executors:
neon-xlarge-executor:
resource_class: xlarge
docker:
# NB: when changed, do not forget to update rust image tag in all Dockerfiles
- image: zimg/rust:1.58
neon-executor:
docker:
- image: zimg/rust:1.58
jobs:
check-codestyle-rust:
executor: neon-xlarge-executor
steps:
- checkout
- run:
name: rustfmt
when: always
command: cargo fmt --all -- --check
# A job to build postgres
build-postgres:
executor: neon-xlarge-executor
parameters:
build_type:
type: enum
enum: ["debug", "release"]
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
# Checkout the git repo (circleci doesn't have a flag to enable submodules here)
- checkout
# Grab the postgres git revision to build a cache key.
# Append makefile as it could change the way postgres is built.
# Note this works even though the submodule hasn't been checkout out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
cat Makefile >> /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
# Build postgres if the restore_cache didn't find a build.
# `make` can't figure out whether the cache is valid, since
# it only compares file timestamps.
- run:
name: build postgres
command: |
if [ ! -e tmp_install/bin/postgres ]; then
# "depth 1" saves some time by not cloning the whole repo
git submodule update --init --depth 1
# bail out on any warnings
COPT='-Werror' mold -run make postgres -j$(nproc)
fi
- save_cache:
name: Save postgres cache
key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
paths:
- tmp_install
# A job to build Neon rust code
build-neon:
executor: neon-xlarge-executor
parameters:
build_type:
type: enum
enum: ["debug", "release"]
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
# Checkout the git repo (without submodules)
- checkout
# Grab the postgres git revision to build a cache key.
# Append makefile as it could change the way postgres is built.
# Note this works even though the submodule hasn't been checkout out yet.
- run:
name: Get postgres cache key
command: |
git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
cat Makefile >> /tmp/cache-key-postgres
- restore_cache:
name: Restore postgres cache
keys:
# Restore ONLY if the rev key matches exactly
- v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
- restore_cache:
name: Restore rust cache
keys:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
# Build the rust code, including test binaries
- run:
name: Rust build << parameters.build_type >>
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS="--release --features profiling"
fi
export CARGO_INCREMENTAL=0
export CACHEPOT_BUCKET=zenith-rust-cachepot
export RUSTC_WRAPPER=cachepot
export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
"${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --bins --tests
cachepot -s
- save_cache:
name: Save rust cache
key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
paths:
- ~/.cargo/registry
- ~/.cargo/git
- target
# Run rust unit tests
- run:
name: cargo test
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
CARGO_FLAGS=
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
CARGO_FLAGS=--release
fi
"${cov_prefix[@]}" cargo test $CARGO_FLAGS
# Install the rust binaries, for use by test jobs
- run:
name: Install rust binaries
command: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
binaries=$(
"${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
test_exe_paths=$(
"${cov_prefix[@]}" cargo test --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
mkdir -p /tmp/zenith/bin
mkdir -p /tmp/zenith/test_bin
mkdir -p /tmp/zenith/etc
# Install target binaries
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/zenith/bin/$bin
cp $SRC $DST
echo $DST >> /tmp/zenith/etc/binaries.list
done
# Install test executables (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/zenith/test_bin/$(basename $bin)
cp $SRC $DST
echo $DST >> /tmp/zenith/etc/binaries.list
done
fi
# Install the postgres binaries, for use by test jobs
- run:
name: Install postgres binaries
command: |
cp -a tmp_install /tmp/zenith/pg_install
- run:
name: Merge coverage data
command: |
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
fi
# Save the rust binaries and coverage data for other jobs in this workflow.
- persist_to_workspace:
root: /tmp/zenith
paths:
- "*"
check-codestyle-python:
executor: neon-executor
steps:
- checkout
- restore_cache:
keys:
- v2-python-deps-{{ checksum "poetry.lock" }}
- run:
name: Install deps
command: ./scripts/pysync
- save_cache:
key: v2-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run:
name: Run yapf to ensure code format
when: always
command: poetry run yapf --recursive --diff .
- run:
name: Run mypy to check types
when: always
command: poetry run mypy .
run-pytest:
executor: neon-executor
parameters:
# pytest args to specify the tests to run.
#
# This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory,
# or '-k foobar' to run tests containing string 'foobar'. See pytest man page
# section SPECIFYING TESTS / SELECTING TESTS for details.
#
# Select the type of Rust build. Must be "release" or "debug".
build_type:
type: string
default: "debug"
# This parameter is required, to prevent the mistake of running all tests in one job.
test_selection:
type: string
default: ""
# Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
extra_params:
type: string
default: ""
needs_postgres_source:
type: boolean
default: false
run_in_parallel:
type: boolean
default: true
save_perf_report:
type: boolean
default: false
environment:
BUILD_TYPE: << parameters.build_type >>
steps:
- attach_workspace:
at: /tmp/zenith
- checkout
- when:
condition: << parameters.needs_postgres_source >>
steps:
- run: git submodule update --init --depth 1
- restore_cache:
keys:
- v2-python-deps-{{ checksum "poetry.lock" }}
- run:
name: Install deps
command: ./scripts/pysync
- save_cache:
key: v2-python-deps-{{ checksum "poetry.lock" }}
paths:
- /home/circleci/.cache/pypoetry/virtualenvs
- run:
name: Run pytest
# pytest doesn't output test logs in real time, so CI job may fail with
# `Too long with no output` error, if a test is running for a long time.
# In that case, tests should have internal timeouts that are less than
# no_output_timeout, specified here.
no_output_timeout: 10m
environment:
- ZENITH_BIN: /tmp/zenith/bin
- POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
- TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
- PLATFORM: zenith-local-ci
command: |
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
rm -rf $PERF_REPORT_DIR
TEST_SELECTION="test_runner/<< parameters.test_selection >>"
EXTRA_PARAMS="<< parameters.extra_params >>"
if [ -z "$TEST_SELECTION" ]; then
echo "test_selection must be set"
exit 1
fi
if << parameters.run_in_parallel >>; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if << parameters.save_perf_report >>; then
if [[ $CIRCLE_BRANCH == "main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
fi
export GITHUB_SHA=$CIRCLE_SHA1
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=()
fi
# Run the tests.
#
# The junit.xml file allows CircleCI to display more fine-grained test information
# in its "Tests" tab in the results page.
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "not remote_cluster" \
-rA $TEST_SELECTION $EXTRA_PARAMS
if << parameters.save_perf_report >>; then
if [[ $CIRCLE_BRANCH == "main" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO=local
scripts/generate_and_push_perf_report.sh
fi
fi
- run:
# CircleCI artifacts are preserved one file at a time, so skipping
# this step isn't a good idea. If you want to extract the
# pageserver state, perhaps a tarball would be a better idea.
name: Delete all data but logs
when: always
command: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "pg.log" ! -name "pageserver.log" ! -name "safekeeper.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" -delete
du -sh /tmp/test_output/*
- store_artifacts:
path: /tmp/test_output
# The store_test_results step tells CircleCI where to find the junit.xml file.
- store_test_results:
path: /tmp/test_output
- run:
name: Merge coverage data
command: |
# This will speed up workspace uploads
if [[ $BUILD_TYPE == "debug" ]]; then
scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
fi
# Save coverage data (if any)
- persist_to_workspace:
root: /tmp/zenith
paths:
- "*"
coverage-report:
executor: neon-xlarge-executor
steps:
- attach_workspace:
at: /tmp/zenith
- checkout
- restore_cache:
name: Restore rust cache
keys:
# Require an exact match. While an out of date cache might speed up the build,
# there's no way to clean out old packages, so the cache grows every time something
# changes.
- v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
- run:
name: Build coverage report
command: |
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
scripts/coverage \
--dir=/tmp/zenith/coverage report \
--input-objects=/tmp/zenith/etc/binaries.list \
--commit-url=$COMMIT_URL \
--format=github
- run:
name: Upload coverage report
command: |
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
scripts/git-upload \
--repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
--message="Add code coverage for $COMMIT_URL" \
copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
# Add link to the coverage report to the commit
curl -f -X POST \
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-H "Accept: application/vnd.github.v3+json" \
--user "$CI_ACCESS_TOKEN" \
--data \
"{
\"state\": \"success\",
\"context\": \"zenith-coverage\",
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
# Build neondatabase/neon:latest image and push it to Docker hub
docker-image:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l)
docker build \
--pull \
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
docker push neondatabase/neon:${DOCKER_TAG}
docker push neondatabase/neon:latest
# Build neondatabase/compute-node:latest image and push it to Docker hub
docker-image-compute:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
# Build neondatabase/compute-tools:latest image and push it to Docker hub
# TODO: this should probably also use versioned tag, not just :latest.
# XXX: but should it? We build and use it only locally now.
- run:
name: Build and push compute-tools Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
docker build \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools .
docker push neondatabase/compute-tools:latest
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push compute-node Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG=$(git log --oneline|wc -l)
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres
docker push neondatabase/compute-node:${DOCKER_TAG}
docker push neondatabase/compute-node:latest
# Build production neondatabase/neon:release image and push it to Docker hub
docker-image-release:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG="release-$(git log --oneline|wc -l)"
docker build \
--pull \
--build-arg GIT_VERSION=${CIRCLE_SHA1} \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
docker push neondatabase/neon:${DOCKER_TAG}
docker push neondatabase/neon:release
# Build production neondatabase/compute-node:release image and push it to Docker hub
docker-image-compute-release:
docker:
- image: cimg/base:2021.04
steps:
- checkout
- setup_remote_docker:
docker_layer_caching: true
# Build neondatabase/compute-tools:release image and push it to Docker hub
# TODO: this should probably also use versioned tag, not just :latest.
# XXX: but should it? We build and use it only locally now.
- run:
name: Build and push compute-tools Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
docker build \
--build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
--build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
--tag neondatabase/compute-tools:release -f Dockerfile.compute-tools .
docker push neondatabase/compute-tools:release
- run:
name: Init postgres submodule
command: git submodule update --init --depth 1
- run:
name: Build and push compute-node Docker image
command: |
echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
DOCKER_TAG="release-$(git log --oneline|wc -l)"
docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres
docker push neondatabase/compute-node:${DOCKER_TAG}
docker push neondatabase/compute-node:release
deploy-staging:
docker:
- image: cimg/python:3.10
steps:
- checkout
- setup_remote_docker
- run:
name: Setup ansible
command: |
pip install --progress-bar off --user ansible boto3
- run:
name: Redeploy
command: |
cd "$(pwd)/.circleci/ansible"
./get_binaries.sh
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i staging.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-staging-proxy:
docker:
- image: cimg/base:2021.04
environment:
KUBECONFIG: .kubeconfig
steps:
- checkout
- run:
name: Store kubeconfig file
command: |
echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- run:
name: Setup helm v3
command: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add zenithdb https://neondatabase.github.io/helm-charts
- run:
name: Re-deploy proxy
command: |
DOCKER_TAG=$(git log --oneline|wc -l)
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
deploy-release:
docker:
- image: cimg/python:3.10
steps:
- checkout
- setup_remote_docker
- run:
name: Setup ansible
command: |
pip install --progress-bar off --user ansible boto3
- run:
name: Redeploy
command: |
cd "$(pwd)/.circleci/ansible"
RELEASE=true ./get_binaries.sh
echo "${TELEPORT_SSH_KEY}" | tr -d '\n'| base64 --decode >ssh-key
echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i production.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-release-proxy:
docker:
- image: cimg/base:2021.04
environment:
KUBECONFIG: .kubeconfig
steps:
- checkout
- run:
name: Store kubeconfig file
command: |
echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- run:
name: Setup helm v3
command: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add zenithdb https://neondatabase.github.io/helm-charts
- run:
name: Re-deploy proxy
command: |
DOCKER_TAG="release-$(git log --oneline|wc -l)"
helm upgrade zenith-proxy zenithdb/zenith-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
# Trigger a new remote CI job
remote-ci-trigger:
docker:
- image: cimg/base:2021.04
parameters:
remote_repo:
type: string
environment:
REMOTE_REPO: << parameters.remote_repo >>
steps:
- run:
name: Set PR's status to pending
command: |
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
curl -f -X POST \
https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-H "Accept: application/vnd.github.v3+json" \
--user "$CI_ACCESS_TOKEN" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
- run:
name: Request a remote CI test
command: |
LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "$CI_ACCESS_TOKEN" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$CIRCLE_SHA1\",
\"remote_repo\": \"$LOCAL_REPO\"
}
}"
workflows:
build_and_test:
jobs:
- check-codestyle-rust
- check-codestyle-python
- build-postgres:
name: build-postgres-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
- build-neon:
name: build-neon-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
requires:
- build-postgres-<< matrix.build_type >>
- run-pytest:
name: pg_regress-tests-<< matrix.build_type >>
context: PERF_TEST_RESULT_CONNSTR
matrix:
parameters:
build_type: ["debug", "release"]
test_selection: batch_pg_regress
needs_postgres_source: true
requires:
- build-neon-<< matrix.build_type >>
- run-pytest:
name: other-tests-<< matrix.build_type >>
matrix:
parameters:
build_type: ["debug", "release"]
test_selection: batch_others
requires:
- build-neon-<< matrix.build_type >>
- run-pytest:
name: benchmarks
context: PERF_TEST_RESULT_CONNSTR
build_type: release
test_selection: performance
run_in_parallel: false
save_perf_report: true
requires:
- build-neon-release
- coverage-report:
# Context passes credentials for gh api
context: CI_ACCESS_TOKEN
requires:
# TODO: consider adding more
- other-tests-debug
- docker-image:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- main
requires:
- pg_regress-tests-release
- other-tests-release
- docker-image-compute:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- main
requires:
- pg_regress-tests-release
- other-tests-release
- deploy-staging:
# Context gives an ability to login
context: Docker Hub
# deploy only for commits to main
filters:
branches:
only:
- main
requires:
- docker-image
- deploy-staging-proxy:
# deploy only for commits to main
filters:
branches:
only:
- main
requires:
- docker-image
- docker-image-release:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- release
requires:
- pg_regress-tests-release
- other-tests-release
- docker-image-compute-release:
# Context gives an ability to login
context: Docker Hub
# Build image only for commits to main
filters:
branches:
only:
- release
requires:
- pg_regress-tests-release
- other-tests-release
- deploy-release:
# Context gives an ability to login
context: Docker Hub
# deploy only for commits to main
filters:
branches:
only:
- release
requires:
- docker-image-release
- deploy-release-proxy:
# deploy only for commits to main
filters:
branches:
only:
- release
requires:
- docker-image-release
- remote-ci-trigger:
# Context passes credentials for gh api
context: CI_ACCESS_TOKEN
remote_repo: "neondatabase/cloud"
requires:
# XXX: Successful build doesn't mean everything is OK, but
# the job to be triggered takes so much time to complete (~22 min)
# that it's better not to wait for the commented-out steps
- build-neon-release
# - pg_regress-tests-release
# - other-tests-release

View File

@@ -1,3 +1,9 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authEndpoint: "https://console.neon.tech/authenticate_proxy_request/"
uri: "https://console.neon.tech/psql_session/"
@@ -22,7 +28,7 @@ exposedService:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech
external-dns.alpha.kubernetes.io/hostname: start.zenith.tech,connect.neon.tech,pg.neon.tech
metrics:
enabled: true

View File

@@ -9,8 +9,8 @@ tmp_install
tmp_check_cli
test_output
.vscode
.neon
integration_tests/.neon
.zenith
integration_tests/.zenith
.mypy_cache
Dockerfile

View File

@@ -1,56 +0,0 @@
name: "Download an artifact"
description: "Custom download action"
inputs:
name:
description: "Artifact name"
required: true
path:
description: "A directory to put artifact into"
default: "."
required: false
skip-if-does-not-exist:
description: "Allow to skip if file doesn't exist, fail otherwise"
default: false
required: false
runs:
using: "composite"
steps:
- name: Download artifact
id: download-artifact
shell: bash -euxo pipefail {0}
env:
TARGET: ${{ inputs.path }}
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)
S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true)
if [ -z "${S3_KEY}" ]; then
if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
echo '::set-output name=SKIPPED::true'
exit 0
else
echo 2>&1 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist"
exit 1
fi
fi
echo '::set-output name=SKIPPED::false'
mkdir -p $(dirname $ARCHIVE)
time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE}
- name: Extract artifact
if: ${{ steps.download-artifact.outputs.SKIPPED == 'false' }}
shell: bash -euxo pipefail {0}
env:
TARGET: ${{ inputs.path }}
ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
run: |
mkdir -p ${TARGET}
time tar -xf ${ARCHIVE} -C ${TARGET}
rm -f ${ARCHIVE}

View File

@@ -1,162 +0,0 @@
name: 'Run python test'
description: 'Runs a Neon python test set, performing all the required preparations before'
inputs:
build_type:
description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
required: true
rust_toolchain:
description: 'Rust toolchain version to fetch the caches'
required: true
test_selection:
description: 'A python test suite to run'
required: true
extra_params:
description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
required: false
default: ''
needs_postgres_source:
description: 'Set to true if the test suite requires postgres source checked out'
required: false
default: 'false'
run_in_parallel:
description: 'Whether to run tests in parallel'
required: false
default: 'true'
save_perf_report:
description: 'Whether to upload the performance report'
required: false
default: 'false'
run_with_real_s3:
description: 'Whether to pass real s3 credentials to the test suite'
required: false
default: 'false'
real_s3_bucket:
description: 'Bucket name for real s3 tests'
required: false
default: ''
real_s3_region:
description: 'Region name for real s3 tests'
required: false
default: ''
real_s3_access_key_id:
description: 'Access key id'
required: false
default: ''
real_s3_secret_access_key:
description: 'Secret access key'
required: false
default: ''
runs:
using: "composite"
steps:
- name: Get Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
path: /tmp/neon
- name: Checkout
if: inputs.needs_postgres_source == 'true'
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Run pytest
env:
NEON_BIN: /tmp/neon/bin
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
TEST_OUTPUT: /tmp/test_output
# this variable will be embedded in perf test report
# and is needed to distinguish different environments
PLATFORM: github-actions-selfhosted
BUILD_TYPE: ${{ inputs.build_type }}
AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
shell: bash -euxo pipefail {0}
run: |
PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
rm -rf $PERF_REPORT_DIR
TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
EXTRA_PARAMS="${{ inputs.extra_params }}"
if [ -z "$TEST_SELECTION" ]; then
echo "test_selection must be set"
exit 1
fi
if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
fi
if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
echo "REAL S3 ENABLED"
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }}
export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }}
fi
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
mkdir -p "$PERF_REPORT_DIR"
EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
fi
fi
if [[ "${{ inputs.build_type }}" == "debug" ]]; then
cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
elif [[ "${{ inputs.build_type }}" == "release" ]]; then
cov_prefix=()
fi
# Run the tests.
#
# The junit.xml file allows CI tools to display more fine-grained test information
# in its "Tests" tab in the results page.
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
# -n4 uses four processes to run tests via pytest-xdist
# -s is not used to prevent pytest from capturing output, because tests are running
# in parallel and logs are mixed between different tests
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "not remote_cluster" \
-rA $TEST_SELECTION $EXTRA_PARAMS
if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
export REPORT_FROM="$PERF_REPORT_DIR"
export REPORT_TO=local
scripts/generate_and_push_perf_report.sh
fi
fi
- name: Delete all data but logs
shell: bash -euxo pipefail {0}
if: always()
run: |
du -sh /tmp/test_output/*
find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
du -sh /tmp/test_output/*
- name: Upload python test logs
if: always()
uses: ./.github/actions/upload
with:
name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
path: /tmp/test_output/

View File

@@ -1,22 +0,0 @@
name: 'Merge and upload coverage data'
description: 'Compresses and uploads the coverage data as an artifact'
runs:
using: "composite"
steps:
- name: Merge coverage data
shell: bash -euxo pipefail {0}
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Download previous coverage data into the same directory
uses: ./.github/actions/download
with:
name: coverage-data-artifact
path: /tmp/coverage
skip-if-does-not-exist: true # skip if there's no previous coverage to download
- name: Upload coverage data
uses: ./.github/actions/upload
with:
name: coverage-data-artifact
path: /tmp/coverage

View File

@@ -1,51 +0,0 @@
name: "Upload an artifact"
description: "Custom upload action"
inputs:
name:
description: "Artifact name"
required: true
path:
description: "A directory or file to upload"
required: true
runs:
using: "composite"
steps:
- name: Prepare artifact
shell: bash -euxo pipefail {0}
env:
SOURCE: ${{ inputs.path }}
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
run: |
mkdir -p $(dirname $ARCHIVE)
if [ -f ${ARCHIVE} ]; then
echo 2>&1 "File ${ARCHIVE} already exist. Something went wrong before"
exit 1
fi
ZSTD_NBTHREADS=0
if [ -d ${SOURCE} ]; then
time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd .
elif [ -f ${SOURCE} ]; then
time tar -cf ${ARCHIVE} --zstd ${SOURCE}
else
echo 2>&1 "${SOURCE} neither directory nor file, don't know how to handle it"
fi
- name: Upload artifact
shell: bash -euxo pipefail {0}
env:
SOURCE: ${{ inputs.path }}
ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
run: |
BUCKET=neon-github-public-dev
PREFIX=artifacts/${GITHUB_RUN_ID}
FILENAME=$(basename $ARCHIVE)
FILESIZE=$(du -sh ${ARCHIVE} | cut -f1)
time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}
# Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary
echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}

View File

@@ -1,20 +0,0 @@
[pageservers]
neon-stress-ps-1 console_region_id=1
neon-stress-ps-2 console_region_id=1
[safekeepers]
neon-stress-sk-1 console_region_id=1
neon-stress-sk-2 console_region_id=1
neon-stress-sk-3 console_region_id=1
[storage:children]
pageservers
safekeepers
[storage:vars]
env_name = neon-stress
console_mgmt_base_url = http://neon-stress-console.local
bucket_name = neon-storage-ireland
bucket_region = eu-west-1
etcd_endpoints = etcd-stress.local:2379
safekeeper_enable_s3_offload = false

View File

@@ -1,18 +0,0 @@
[Unit]
Description=Zenith safekeeper
After=network.target auditd.service
[Service]
Type=simple
User=safekeeper
Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
ExecReload=/bin/kill -HUP $MAINPID
KillMode=mixed
KillSignal=SIGINT
Restart=on-failure
TimeoutSec=10
LimitNOFILE=30000000
[Install]
WantedBy=multi-user.target

View File

@@ -1,26 +0,0 @@
fullnameOverride: "neon-stress-proxy-scram"
settings:
authBackend: "console"
authEndpoint: "http://neon-stress-console.local/management/api/v2"
domain: "*.stress.neon.tech"
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.stress.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -1,34 +0,0 @@
fullnameOverride: "neon-stress-proxy"
settings:
authEndpoint: "https://console.dev.neon.tech/authenticate_proxy_request/"
uri: "https://console.dev.neon.tech/psql_session/"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy
zenith_env: staging
zenith_region: eu-west-1
zenith_region_slug: ireland
service:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internal
external-dns.alpha.kubernetes.io/hostname: neon-stress-proxy.local
type: LoadBalancer
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: connect.dev.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -1,24 +0,0 @@
settings:
authBackend: "console"
authEndpoint: "http://console-release.local/management/api/v2"
domain: "*.cloud.neon.tech"
podLabels:
zenith_service: proxy-scram
zenith_env: production
zenith_region: us-west-2
zenith_region_slug: oregon
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: '*.cloud.neon.tech'
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -1,31 +0,0 @@
# Helm chart values for zenith-proxy.
# This is a YAML-formatted file.
image:
repository: neondatabase/neon
settings:
authBackend: "console"
authEndpoint: "http://console-staging.local/management/api/v2"
domain: "*.cloud.stage.neon.tech"
# -- Additional labels for zenith-proxy pods
podLabels:
zenith_service: proxy-scram
zenith_env: staging
zenith_region: us-east-1
zenith_region_slug: virginia
exposedService:
annotations:
service.beta.kubernetes.io/aws-load-balancer-type: external
service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing
external-dns.alpha.kubernetes.io/hostname: cloud.stage.neon.tech
metrics:
enabled: true
serviceMonitor:
enabled: true
selector:
release: kube-prometheus-stack

View File

@@ -11,7 +11,7 @@ on:
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '36 4 * * *' # run once a day, timezone is utc
- cron: '36 7 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
@@ -19,18 +19,18 @@ jobs:
bench:
# this workflow runs on self hosteed runner
# it's environment is quite different from usual guthub runner
# probably the most important difference is that it doesn't start from clean workspace each time
# probably the most important difference is that it doesnt start from clean workspace each time
# e g if you install system packages they are not cleaned up since you install them directly in host machine
# not a container or something
# See documentation for more info: https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners
runs-on: [self-hosted, zenith-benchmarker]
env:
POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"
POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
steps:
- name: Checkout zenith repo
uses: actions/checkout@v3
uses: actions/checkout@v2
# actions/setup-python@v2 is not working correctly on self-hosted runners
# see https://github.com/actions/setup-python/issues/162
@@ -60,7 +60,7 @@ jobs:
- name: Setup cluster
env:
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
shell: bash -euxo pipefail {0}
shell: bash
run: |
set -e
@@ -88,7 +88,7 @@ jobs:
# Plus time needed to initialize the test databases.
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
PLATFORM: "neon-staging"
PLATFORM: "zenith-staging"
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
run: |
@@ -96,7 +96,7 @@ jobs:
# since it might generate duplicates when calling ingest_perf_test_result.py
rm -rf perf-report-staging
mkdir -p perf-report-staging
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600
./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
- name: Submit result
env:
@@ -104,12 +104,3 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
run: |
REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
- name: Post to a Slack channel
if: ${{ github.event.schedule && failure() }}
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

View File

@@ -1,620 +0,0 @@
name: Test and Deploy
on:
push:
branches:
- main
- release
pull_request:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
jobs:
build-neon:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.sha }}
steps:
- name: Fix git ownerwhip
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
# It also includes --features, if any
#
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
- name: Set env variables
run: |
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FEATURES=""
CARGO_FLAGS=""
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FEATURES="--features profiling"
CARGO_FLAGS="--release $CARGO_FEATURES"
fi
echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
# directory, and it's faster to let 'cargo' to rebuild it from the
# compressed crates.
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry/
!~/.cargo/registry/src
~/.cargo/git/
target/
# Fall back to older versions of the key, if no cache for current Cargo.lock was found
key: |
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v3
with:
path: tmp_install/
key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: mold -run make postgres -j$(nproc)
- name: Run cargo build
run: |
${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
- name: Run cargo test
run: |
${cov_prefix} cargo test $CARGO_FLAGS
- name: Install rust binaries
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
binaries=$(
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
cp "$SRC" "$DST"
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
# We don't need debug symbols for code coverage, so strip them out to make
# the artifact smaller.
strip "$SRC" -o "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
- name: Install postgres binaries
run: cp -a tmp_install /tmp/neon/pg_install
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: /tmp/neon
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
pg_regress-tests:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
- name: Pytest regress tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_pg_regress
needs_postgres_source: true
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
other-tests:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
- name: Pytest other tests
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: batch_others
run_with_real_s3: true
real_s3_bucket: ci-tests-s3
real_s3_region: us-west-2
real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
benchmarks:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ build-neon ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
matrix:
build_type: [ release ]
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 2
- name: Pytest benchmarks
uses: ./.github/actions/run-python-test-set
with:
build_type: ${{ matrix.build_type }}
rust_toolchain: ${{ matrix.rust_toolchain }}
test_selection: performance
run_in_parallel: false
save_perf_report: true
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones
coverage-report:
runs-on: dev
container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
needs: [ other-tests, pg_regress-tests ]
strategy:
fail-fast: false
matrix:
build_type: [ debug ]
rust_toolchain: [ 1.58 ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 1
- name: Restore cargo deps cache
id: cache_cargo
uses: actions/cache@v3
with:
path: |
~/.cargo/registry/
!~/.cargo/registry/src
~/.cargo/git/
target/
key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
- name: Get Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
path: /tmp/neon
- name: Get coverage artifact
uses: ./.github/actions/download
with:
name: coverage-data-artifact
path: /tmp/coverage
- name: Merge coverage data
run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
- name: Build and upload coverage report
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
scripts/coverage \
--dir=/tmp/coverage report \
--input-objects=/tmp/coverage/binaries.list \
--commit-url=$COMMIT_URL \
--format=github
REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
scripts/git-upload \
--repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
--message="Add code coverage for $COMMIT_URL" \
copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
# Add link to the coverage report to the commit
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"success\",
\"context\": \"neon-coverage\",
\"description\": \"Coverage report is ready\",
\"target_url\": \"$REPORT_URL\"
}"
trigger-e2e-tests:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ build-neon ]
steps:
- name: Set PR's status to pending and request a remote CI test
run: |
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
REMOTE_REPO="${{ github.repository_owner }}/cloud"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\"
}
}"
docker-image:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ pg_regress-tests, other-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
with:
driver: docker
- name: Get build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: build-tag
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: legacy-build-tag
- name: Build neon Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
pull: true
push: true
tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
docker-image-compute:
runs-on: [ self-hosted, Linux, k8s-runner ]
needs: [ pg_regress-tests, other-tests ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
build-tag: ${{steps.build-tag.outputs.tag}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
with:
driver: docker
- name: Get build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::$(git rev-list --count HEAD)"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: build-tag
- name: Get legacy build tag
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
echo "::set-output name=tag::latest"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
echo "::set-output name=tag::release"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
id: legacy-build-tag
- name: Build compute-tools Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
push: false
file: Dockerfile.compute-tools
tags: neondatabase/compute-tools:local
- name: Push compute-tools Docker image
uses: docker/build-push-action@v2
with:
context: .
build-args: |
GIT_VERSION="${{github.sha}}"
AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
push: true
file: Dockerfile.compute-tools
tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
- name: Build compute-node Docker image
uses: docker/build-push-action@v2
with:
context: ./vendor/postgres/
build-args:
COMPUTE_TOOLS_TAG=local
push: true
tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
calculate-deploy-targets:
runs-on: [ self-hosted, Linux, k8s-runner ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
outputs:
matrix-include: ${{ steps.set-matrix.outputs.include }}
steps:
- id: set-matrix
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
echo "::set-output name=include::[$PRODUCTION]"
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
deploy:
runs-on: [ self-hosted, Linux, k8s-runner ]
# We need both storage **and** compute images for deploy, because control plane
# picks the compute version based on the storage version. If it notices a fresh
# storage it may bump the compute version. And if compute image failed to build
# it may break things badly.
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Setup ansible
run: |
pip install --progress-bar off --user ansible boto3
- name: Redeploy
run: |
cd "$(pwd)/.github/ansible"
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
./get_binaries.sh
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
RELEASE=true ./get_binaries.sh
else
echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
exit 1
fi
eval $(ssh-agent)
echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key
echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
chmod 0600 ssh-key
ssh-add ssh-key
rm -f ssh-key ssh-key-cert.pub
ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
rm -f neon_install.tar.gz .neon_current_version
deploy-proxy:
runs-on: [ self-hosted, Linux, k8s-runner ]
# Compute image isn't strictly required for proxy deploy, but let's still wait for it
# to run all deploy jobs consistently.
needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
if: |
(github.ref_name == 'main' || github.ref_name == 'release') &&
github.event_name != 'workflow_dispatch'
strategy:
matrix:
include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
env:
KUBECONFIG: .kubeconfig
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: true
fetch-depth: 0
- name: Store kubeconfig file
run: |
echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
chmod 0600 ${KUBECONFIG}
- name: Setup helm v3
run: |
curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
helm repo add neondatabase https://neondatabase.github.io/helm-charts
- name: Re-deploy proxy
run: |
DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s

View File

@@ -1,135 +0,0 @@
name: Check code style and build
on:
push:
branches:
- main
pull_request:
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
env:
RUST_BACKTRACE: 1
jobs:
check-codestyle-rust:
strategy:
fail-fast: false
matrix:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [1.58]
os: [ubuntu-latest, macos-latest]
timeout-minutes: 60
name: run regression test suite
runs-on: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
- name: Install rust toolchain ${{ matrix.rust_toolchain }}
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: ${{ matrix.rust_toolchain }}
components: rustfmt, clippy
override: true
- name: Check formatting
run: cargo fmt --all -- --check
- name: Install Ubuntu postgres dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev
- name: Install macOS postgres dependencies
if: matrix.os == 'macos-latest'
run: brew install flex bison openssl
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v2
with:
path: |
tmp_install/
key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}
- name: Set extra env for macOS
if: matrix.os == 'macos-latest'
run: |
echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV
echo 'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: make postgres
# Plain configure output can contain weird errors like 'error: C compiler cannot create executables'
# and the real cause will be inside config.log
- name: Print configure logs in case of failure
if: failure()
continue-on-error: true
run: |
echo '' && echo '=== config.log ===' && echo ''
cat tmp_install/build/config.log
echo '' && echo '=== configure.log ===' && echo ''
cat tmp_install/build/configure.log
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v2
with:
path: |
~/.cargo/registry
!~/.cargo/registry/src
~/.cargo/git
target
key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}
- name: Run cargo clippy
run: ./run_clippy.sh
- name: Ensure all project builds
run: cargo build --all --all-targets
check-codestyle-python:
runs-on: [ self-hosted, Linux, k8s-runner ]
steps:
- name: Checkout
uses: actions/checkout@v3
with:
submodules: false
fetch-depth: 1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
run: ./scripts/pysync
- name: Run yapf to ensure code format
run: poetry run yapf --recursive --diff .
- name: Run mypy to check types
run: poetry run mypy .

View File

@@ -1,72 +0,0 @@
name: Test Postgres client libraries
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 02 * * *' # run once a day, timezone is utc
workflow_dispatch:
concurrency:
# Allow only one workflow per any non-`main` branch.
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
cancel-in-progress: true
jobs:
test-postgres-client-libs:
runs-on: [ ubuntu-latest ]
steps:
- name: Checkout
uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install Poetry
uses: snok/install-poetry@v1
- name: Cache poetry deps
id: cache_poetry
uses: actions/cache@v3
with:
path: ~/.cache/pypoetry/virtualenvs
key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
- name: Install Python deps
shell: bash -euxo pipefail {0}
run: ./scripts/pysync
- name: Run pytest
env:
REMOTE_ENV: 1
BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
TEST_OUTPUT: /tmp/test_output
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
shell: bash -euxo pipefail {0}
run: |
# Test framework expects we have psql binary;
# but since we don't really need it in this test, let's mock it
mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--tb=short \
--verbose \
-m "remote_cluster" \
-rA "test_runner/pg_clients"
- name: Post to a Slack channel
if: failure()
id: slack
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

71
.github/workflows/testing.yml vendored Normal file
View File

@@ -0,0 +1,71 @@
name: Build and Test
on: push
jobs:
regression-check:
strategy:
matrix:
# If we want to duplicate this job for different
# Rust toolchains (e.g. nightly or 1.37.0), add them here.
rust_toolchain: [stable]
os: [ubuntu-latest, macos-latest]
timeout-minutes: 30
name: run regression test suite
runs-on: ${{ matrix.os }}
steps:
- name: Checkout
uses: actions/checkout@v2
with:
submodules: true
fetch-depth: 2
- name: install rust toolchain ${{ matrix.rust_toolchain }}
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: ${{ matrix.rust_toolchain }}
override: true
- name: Install Ubuntu postgres dependencies
if: matrix.os == 'ubuntu-latest'
run: |
sudo apt update
sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev
- name: Install macOs postgres dependencies
if: matrix.os == 'macos-latest'
run: brew install flex bison
- name: Set pg revision for caching
id: pg_ver
run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
- name: Cache postgres build
id: cache_pg
uses: actions/cache@v2
with:
path: |
tmp_install/
key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }}
- name: Build postgres
if: steps.cache_pg.outputs.cache-hit != 'true'
run: make postgres
- name: Cache cargo deps
id: cache_cargo
uses: actions/cache@v2
with:
path: |
~/.cargo/registry
~/.cargo/git
target
key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}
- name: Run cargo clippy
run: ./run_clippy.sh
- name: Run cargo test
run: cargo test --all --all-targets

8
.gitignore vendored
View File

@@ -5,13 +5,9 @@
__pycache__/
test_output/
.vscode
.idea
/.neon
/integration_tests/.neon
/.zenith
/integration_tests/.zenith
# Coverage
*.profraw
*.profdata
*.key
*.crt

View File

@@ -6,5 +6,5 @@ target/
tmp_install/
__pycache__/
test_output/
.neon/
.zenith/
.git/

View File

@@ -11,15 +11,17 @@ than it was before.
## Submitting changes
1. Get at least one +1 on your PR before you push.
1. Make a PR for every change.
Even seemingly trivial patches can break things in surprising ways.
Use of common sense is OK. If you're only fixing a typo in a comment,
it's probably fine to just push it. But if in doubt, open a PR.
2. Get at least one +1 on your PR before you push.
For simple patches, it will only take a minute for someone to review
it.
2. Don't force push small changes after making the PR ready for review.
Doing so will force readers to re-read your entire PR, which will delay
the review process.
3. Always keep the CI green.
Do not push, if the CI failed on your PR. Even if you think it's not

20
COPYRIGHT Normal file
View File

@@ -0,0 +1,20 @@
This software is licensed under the Apache 2.0 License:
----------------------------------------------------------------------------
Copyright 2021 Zenith Labs, Inc
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
----------------------------------------------------------------------------
The PostgreSQL submodule in vendor/postgres is licensed under the
PostgreSQL license. See vendor/postgres/COPYRIGHT.

1246
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -6,7 +6,7 @@ members = [
"proxy",
"safekeeper",
"workspace_hack",
"neon_local",
"zenith",
"libs/*",
]

View File

@@ -1,5 +1,5 @@
# Build Postgres
FROM neondatabase/rust:1.58 AS pg-build
FROM zimg/rust:1.58 AS pg-build
WORKDIR /pg
USER root
@@ -14,13 +14,9 @@ RUN set -e \
&& tar -C tmp_install -czf /postgres_install.tar.gz .
# Build zenith binaries
FROM neondatabase/rust:1.58 AS build
FROM zimg/rust:1.58 AS build
ARG GIT_VERSION=local
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY
@@ -29,7 +25,7 @@ COPY --from=pg-build /pg/tmp_install/include/postgresql/server tmp_install/inclu
COPY . .
# Show build caching stats to check if it was used in the end.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
# Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, loosing the compilation stats.
RUN set -e \
&& sudo -E "PATH=$PATH" mold -run cargo build --release \
&& cachepot -s
@@ -50,9 +46,9 @@ RUN set -e \
&& useradd -d /data zenith \
&& chown -R zenith:zenith /data
COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy /usr/local/bin
COPY --from=pg-build /pg/tmp_install/ /usr/local/
COPY --from=pg-build /postgres_install.tar.gz /data/

95
Dockerfile.alpine Normal file
View File

@@ -0,0 +1,95 @@
#
# Docker image for console integration testing.
#
# We may also reuse it in CI to unify installation process and as a general binaries building
# tool for production servers.
#
# Dynamic linking is used for librocksdb and libstdc++ bacause librocksdb-sys calls
# bindgen with "dynamic" feature flag. This also prevents usage of dockerhub alpine-rust
# images which are statically linked and have guards against any dlopen. I would rather
# prefer all static binaries so we may change the way librocksdb-sys builds or wait until
# we will have our own storage and drop rockdb dependency.
#
# Cargo-chef is used to separate dependencies building from main binaries building. This
# way `docker build` will download and install dependencies only of there are changes to
# out Cargo.toml files.
#
#
# build postgres separately -- this layer will be rebuilt only if one of
# mentioned paths will get any changes
#
FROM alpine:3.13 as pg-build
RUN apk add --update clang llvm compiler-rt compiler-rt-static lld musl-dev binutils \
make bison flex readline-dev zlib-dev perl linux-headers libseccomp-dev
WORKDIR zenith
COPY ./vendor/postgres vendor/postgres
COPY ./Makefile Makefile
# Build using clang and lld
RUN CC='clang' LD='lld' CFLAGS='-fuse-ld=lld --rtlib=compiler-rt' make postgres -j4
#
# Calculate cargo dependencies.
# This will always run, but only generate recipe.json with list of dependencies without
# installing them.
#
FROM alpine:20210212 as cargo-deps-inspect
RUN apk add --update rust cargo
RUN cargo install cargo-chef
WORKDIR zenith
COPY . .
RUN cargo chef prepare --recipe-path recipe.json
#
# Build cargo dependencies.
# This temp cantainner would be build only if recipe.json was changed.
#
FROM alpine:20210212 as deps-build
RUN apk add --update rust cargo openssl-dev clang build-base
# rust-rocksdb can be built against system-wide rocksdb -- that saves about
# 10 minutes during build. Rocksdb apk package is in testing now, but use it
# anyway. In case of any troubles we can download and build rocksdb here manually
# (to cache it as a docker layer).
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=cargo-deps-inspect /root/.cargo/bin/cargo-chef /root/.cargo/bin/
COPY --from=cargo-deps-inspect /zenith/recipe.json recipe.json
RUN ROCKSDB_LIB_DIR=/usr/lib/ cargo chef cook --release --recipe-path recipe.json
#
# Build zenith binaries
#
FROM alpine:20210212 as build
RUN apk add --update rust cargo openssl-dev clang build-base
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb-dev
WORKDIR zenith
COPY . .
# Copy cached dependencies
COPY --from=pg-build /zenith/tmp_install/include/postgresql/server tmp_install/include/postgresql/server
COPY --from=deps-build /zenith/target target
COPY --from=deps-build /root/.cargo /root/.cargo
RUN cargo build --release
#
# Copy binaries to resulting image.
# build-base hare to provide libstdc++ (it will also bring gcc, but leave it this way until we figure
# out how to statically link rocksdb or avoid it at all).
#
FROM alpine:3.13
RUN apk add --update openssl build-base libseccomp-dev
RUN apk --no-cache --update --repository https://dl-cdn.alpinelinux.org/alpine/edge/testing add rocksdb
COPY --from=build /zenith/target/release/pageserver /usr/local/bin
COPY --from=build /zenith/target/release/safekeeper /usr/local/bin
COPY --from=build /zenith/target/release/proxy /usr/local/bin
COPY --from=pg-build /zenith/tmp_install /usr/local
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN addgroup zenith && adduser -h /data -D -G zenith zenith
VOLUME ["/data"]
WORKDIR /data
USER zenith
EXPOSE 6400
ENTRYPOINT ["/docker-entrypoint.sh"]
CMD ["pageserver"]

View File

@@ -1,11 +1,7 @@
# First transient image to build compute_tools binaries
# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
FROM neondatabase/rust:1.58 AS rust-build
# NB: keep in sync with rust image version in .circle/config.yml
FROM zimg/rust:1.58 AS rust-build
# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
ARG RUSTC_WRAPPER=cachepot
ARG CACHEPOT_BUCKET=zenith-rust-cachepot
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY
@@ -19,4 +15,4 @@ RUN set -e \
# Final image that only has one binary
FROM debian:buster-slim
COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
COPY --from=rust-build /home/circleci/project/target/release/zenith_ctl /usr/local/bin/zenith_ctl

View File

@@ -1,8 +1,3 @@
ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
# Where to install Postgres, default is ./tmp_install, maybe useful for package managers
POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install
# Seccomp BPF is only available for Linux
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Linux)
@@ -17,23 +12,15 @@ endif
#
BUILD_TYPE ?= debug
ifeq ($(BUILD_TYPE),release)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl
PG_CONFIGURE_OPTS = --enable-debug
PG_CFLAGS = -O2 -g3 $(CFLAGS)
# Unfortunately, `--profile=...` is a nightly feature
CARGO_BUILD_FLAGS += --release
else ifeq ($(BUILD_TYPE),debug)
PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
PG_CONFIGURE_OPTS = --enable-debug --enable-cassert --enable-depend
PG_CFLAGS = -O0 -g3 $(CFLAGS)
else
$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
endif
# macOS with brew-installed openssl requires explicit paths
# It can be configured with OPENSSL_PREFIX variable
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
$(error Bad build type `$(BUILD_TYPE)', see Makefile for options)
endif
# Choose whether we should be silent or verbose
@@ -62,55 +49,55 @@ zenith: postgres-headers
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
### PostgreSQL parts
$(POSTGRES_INSTALL_DIR)/build/config.status:
tmp_install/build/config.status:
+@echo "Configuring postgres build"
mkdir -p $(POSTGRES_INSTALL_DIR)/build
(cd $(POSTGRES_INSTALL_DIR)/build && \
$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
mkdir -p tmp_install/build
(cd tmp_install/build && \
../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
$(PG_CONFIGURE_OPTS) \
$(SECCOMP) \
--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)
--prefix=$(abspath tmp_install) > configure.log)
# nicer alias for running 'configure'
.PHONY: postgres-configure
postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status
postgres-configure: tmp_install/build/config.status
# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
# Install the PostgreSQL header files into tmp_install/include
.PHONY: postgres-headers
postgres-headers: postgres-configure
+@echo "Installing PostgreSQL headers"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install
$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
# Compile and install PostgreSQL and contrib/neon
# Compile and install PostgreSQL and contrib/zenith
.PHONY: postgres
postgres: postgres-configure \
postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
+@echo "Compiling PostgreSQL"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
+@echo "Compiling contrib/neon"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
+@echo "Compiling contrib/neon_test_utils"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+@echo "Compiling contrib/zenith"
$(MAKE) -C tmp_install/build/contrib/zenith install
+@echo "Compiling contrib/zenith_test_utils"
$(MAKE) -C tmp_install/build/contrib/zenith_test_utils install
+@echo "Compiling pg_buffercache"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+@echo "Compiling pageinspect"
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install
$(MAKE) -C tmp_install/build/contrib/pageinspect install
.PHONY: postgres-clean
postgres-clean:
$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean
$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
# This doesn't remove the effects of 'configure'.
.PHONY: clean
clean:
cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
cd tmp_install/build && $(MAKE) clean
$(CARGO_CMD_PREFIX) cargo clean
# This removes everything
.PHONY: distclean
distclean:
rm -rf $(POSTGRES_INSTALL_DIR)
rm -rf tmp_install
$(CARGO_CMD_PREFIX) cargo clean
.PHONY: fmt
@@ -119,4 +106,4 @@ fmt:
.PHONY: setup-pre-commit-hook
setup-pre-commit-hook:
ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
ln -s -f ../../pre-commit.py .git/hooks/pre-commit

5
NOTICE
View File

@@ -1,5 +0,0 @@
Neon
Copyright 2022 Neon Inc.
The PostgreSQL submodule in vendor/postgres is licensed under the
PostgreSQL license. See vendor/postgres/COPYRIGHT.

194
README.md
View File

@@ -1,146 +1,85 @@
# Neon
Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.
Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
The project used to be called "Zenith". Many of the commands and code comments
still refer to "zenith", but we are in the process of renaming things.
## Quick start
[Join the waitlist](https://neon.tech/) for our free tier to receive your serverless postgres instance. Then connect to it with your preferred postgres client (psql, dbeaver, etc) or use the online SQL editor.
Alternatively, compile and run the project [locally](#running-local-installation).
## Architecture overview
A Neon installation consists of compute nodes and a Neon storage engine.
A Neon installation consists of compute nodes and Neon storage engine.
Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.
Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.
The Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for the compute nodes.
- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.
Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for compute nodes.
- WAL service. The service that receives WAL from compute node and ensures that it is stored durably.
Pageserver consists of:
- Repository - Neon storage implementation.
- WAL receiver - service that receives WAL from WAL service and stores it in the repository.
- Page service - service that communicates with compute nodes and responds with pages from the repository.
- WAL redo - service that builds pages from base images and WAL records on Page service request
- WAL redo - service that builds pages from base images and WAL records on Page service request.
## Running local installation
1. Install build dependencies and other useful packages
#### Installing dependencies on Linux
1. Install build dependencies and other applicable packages
* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
```bash
On Ubuntu or Debian this set of packages should be sufficient to build the code:
```text
apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client
```
* On Fedora, these packages are needed:
```bash
dnf install flex bison readline-devel zlib-devel openssl-devel \
libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib
libssl-dev clang pkg-config libpq-dev
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
```
# recommended approach from https://www.rust-lang.org/tools/install
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
```
[Rust] 1.58 or later is also required.
#### Installing dependencies on OSX (12.3.1)
1. Install XCode and dependencies
```
xcode-select --install
brew install protobuf etcd openssl
```
2. [Install Rust](https://www.rust-lang.org/tools/install)
```
# recommended approach from https://www.rust-lang.org/tools/install
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
```
3. Install PostgreSQL Client
```
# from https://stackoverflow.com/questions/44654216/correct-way-to-install-psql-without-full-postgres-on-macos
brew install libpq
brew link --force libpq
```
#### Building on Linux
1. Build neon and patched postgres
```
# Note: The path to the neon sources can not contain a space.
git clone --recursive https://github.com/neondatabase/neon.git
cd neon
# The preferred and default is to make a debug build. This will create a
# demonstrably slower build than a release build. If you want to use a release
# build, utilize "BUILD_TYPE=release make -j`nproc`"
make -j`nproc`
```
#### Building on OSX
1. Build neon and patched postgres
```
# Note: The path to the neon sources can not contain a space.
git clone --recursive https://github.com/neondatabase/neon.git
cd neon
# The preferred and default is to make a debug build. This will create a
# demonstrably slower build than a release build. If you want to use a release
# build, utilize "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
make -j`sysctl -n hw.logicalcpu`
```
#### Dependency installation notes
To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.
To run the integration tests or Python scripts (not required to use the code), install
Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory.
Python (3.7 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
#### Running neon database
1. Start pageserver and postgres on top of it (should be called from repo root):
2. Build neon and patched postgres
```sh
# Create repository in .neon with proper paths to binaries and data
git clone --recursive https://github.com/neondatabase/neon.git
cd neon
make -j5
```
3. Start pageserver and postgres on top of it (should be called from repo root):
```sh
# Create repository in .zenith with proper paths to binaries and data
# Later that would be responsibility of a package install script
> ./target/debug/neon_local init
initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
created initial timeline de200bd42b49cc1814412c7e592dd6e9 timeline.lsn 0/16B5A50
initial timeline de200bd42b49cc1814412c7e592dd6e9 created
> ./target/debug/zenith init
initializing tenantid c03ba6b7ad4c5e9cf556f059ade44229
created initial timeline 5b014a9e41b4b63ce1a1febc04503636 timeline.lsn 0/169C3C8
created main branch
pageserver init succeeded
# start pageserver and safekeeper
> ./target/debug/neon_local start
Starting pageserver at '127.0.0.1:64000' in '.neon'
> ./target/debug/zenith start
Starting pageserver at 'localhost:64000' in '.zenith'
Pageserver started
initializing for sk 1 for 7676
Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
initializing for single for 7676
Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/single'
Safekeeper started
# start postgres compute node
> ./target/debug/neon_local pg start main
Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'
> ./target/debug/zenith pg start main
Starting new postgres main on timeline 5b014a9e41b4b63ce1a1febc04503636 ...
Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/c03ba6b7ad4c5e9cf556f059ade44229/main port=55432
Starting postgres node at 'host=127.0.0.1 port=55432 user=zenith_admin dbname=postgres'
waiting for server to start.... done
server started
# check list of running postgres instances
> ./target/debug/neon_local pg list
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16B5BA8 running
> ./target/debug/zenith pg list
NODE ADDRESS TIMELINES BRANCH NAME LSN STATUS
main 127.0.0.1:55432 5b014a9e41b4b63ce1a1febc04503636 main 0/1609610 running
```
2. Now, it is possible to connect to postgres and run some queries:
4. Now it is possible to connect to postgres and run some queries:
```text
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
> psql -p55432 -h 127.0.0.1 -U zenith_admin postgres
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE
postgres=# insert into t values(1,1);
@@ -152,32 +91,25 @@ postgres=# select * from t;
(1 row)
```
3. And create branches and run postgres on them:
5. And create branches and run postgres on them:
```sh
# create branch named migration_check
> ./target/debug/neon_local timeline branch --branch-name migration_check
Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c. Ancestor timeline: 'main'
> ./target/debug/zenith timeline branch --branch-name migration_check
Created timeline '0e9331cad6efbafe6a88dd73ae21a5c9' at Lsn 0/16F5830 for tenant: c03ba6b7ad4c5e9cf556f059ade44229. Ancestor timeline: 'main'
# check branches tree
> ./target/debug/neon_local timeline list
(L) main [de200bd42b49cc1814412c7e592dd6e9]
(L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
> ./target/debug/zenith timeline list
main [5b014a9e41b4b63ce1a1febc04503636]
┗━ @0/1609610: migration_check [0e9331cad6efbafe6a88dd73ae21a5c9]
# start postgres on that branch
> ./target/debug/neon_local pg start migration_check --branch-name migration_check
Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'
# check the new list of running postgres instances
> ./target/debug/neon_local pg list
NODE ADDRESS TIMELINE BRANCH NAME LSN STATUS
main 127.0.0.1:55432 de200bd42b49cc1814412c7e592dd6e9 main 0/16F9A38 running
migration_check 127.0.0.1:55433 b3b863fa45fa9e57e615f9f2d944e601 migration_check 0/16F9A70 running
> ./target/debug/zenith pg start migration_check
Starting postgres node at 'host=127.0.0.1 port=55433 user=stas'
waiting for server to start.... done
# this new postgres instance will have all the data from 'main' postgres,
# but all modifications would not affect data in original postgres
> psql -p55433 -h 127.0.0.1 -U cloud_admin postgres
> psql -p55433 -h 127.0.0.1 -U zenith_admin postgres
postgres=# select * from t;
key | value
-----+-------
@@ -186,26 +118,16 @@ postgres=# select * from t;
postgres=# insert into t values(2,2);
INSERT 0 1
# check that the new change doesn't affect the 'main' postgres
> psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
postgres=# select * from t;
key | value
-----+-------
1 | 1
(1 row)
```
4. If you want to run tests afterward (see below), you must stop all the running of the pageserver, safekeeper, and postgres instances
you have just started. You can terminate them all with one command:
6. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
you have just started. You can stop them all with one command:
```sh
> ./target/debug/neon_local stop
> ./target/debug/zenith stop
```
## Running tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
```sh
git clone --recursive https://github.com/neondatabase/neon.git
make # builds also postgres and installs it to ./tmp_install
@@ -222,13 +144,13 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d
### Postgres-specific terms
Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used.
The same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
To get more familiar with this aspect, refer to:
- [Neon glossary](/docs/glossary.md)
- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html)
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
- Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))
## Join the development

View File

@@ -4,6 +4,7 @@ version = "0.1.0"
edition = "2021"
[dependencies]
libc = "0.2"
anyhow = "1.0"
chrono = "0.4"
clap = "3.0"
@@ -17,5 +18,4 @@ serde_json = "1"
tar = "0.4"
tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
url = "2.2.2"
workspace_hack = { version = "0.1", path = "../workspace_hack" }

View File

@@ -1,9 +1,9 @@
# Compute node tools
Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
`ExecStart` option. It will handle all the `Neon` specifics during compute node
Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
`ExecStart` option. It will handle all the `zenith` specifics during compute node
initialization:
- `compute_ctl` accepts cluster (compute node) specification as a JSON file.
- `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
- Every start is a fresh start, so the data directory is removed and
initialized again on each run.
- Next it will put configuration files into the `PGDATA` directory.
@@ -13,18 +13,18 @@ initialization:
- Check and alter/drop/create roles and databases.
- Hang waiting on the `postmaster` process to exit.
Also `compute_ctl` spawns two separate service threads:
Also `zenith_ctl` spawns two separate service threads:
- `compute-monitor` checks the last Postgres activity timestamp and saves it
into the shared `ComputeNode`;
into the shared `ComputeState`;
- `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
last activity requests.
Usage example:
```sh
compute_ctl -D /var/db/postgres/compute \
-C 'postgresql://cloud_admin@localhost/postgres' \
-S /var/db/postgres/specs/current.json \
-b /usr/local/bin/postgres
zenith_ctl -D /var/db/postgres/compute \
-C 'postgresql://zenith_admin@localhost/postgres' \
-S /var/db/postgres/specs/current.json \
-b /usr/local/bin/postgres
```
## Tests

View File

@@ -1,175 +0,0 @@
//!
//! Postgres wrapper (`compute_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
//! `ExecStart` option. It will handle all the `Neon` specifics during compute node
//! initialization:
//! - `compute_ctl` accepts cluster (compute node) specification as a JSON file.
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
//! - Try to start `postgres` and wait until it is ready to accept connections.
//! - Check and alter/drop/create roles and databases.
//! - Hang waiting on the `postmaster` process to exit.
//!
//! Also `compute_ctl` spawns two separate service threads:
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
//! into the shared `ComputeNode`;
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
//! last activity requests.
//!
//! Usage example:
//! ```sh
//! compute_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://cloud_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres
//! ```
//!
use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::exit;
use std::sync::{Arc, RwLock};
use std::{thread, time::Duration};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use log::{error, info};
use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
use compute_tools::http::api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::pg_helpers::*;
use compute_tools::spec::*;
use url::Url;
fn main() -> Result<()> {
// TODO: re-use `utils::logging` later
init_logger(DEFAULT_LOG_LEVEL)?;
// Env variable is set by `cargo`
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
let matches = clap::App::new("compute_ctl")
.version(version.unwrap_or("unknown"))
.arg(
Arg::new("connstr")
.short('C')
.long("connstr")
.value_name("DATABASE_URL")
.required(true),
)
.arg(
Arg::new("pgdata")
.short('D')
.long("pgdata")
.value_name("DATADIR")
.required(true),
)
.arg(
Arg::new("pgbin")
.short('b')
.long("pgbin")
.value_name("POSTGRES_PATH"),
)
.arg(
Arg::new("spec")
.short('s')
.long("spec")
.value_name("SPEC_JSON"),
)
.arg(
Arg::new("spec-path")
.short('S')
.long("spec-path")
.value_name("SPEC_PATH"),
)
.get_matches();
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
let connstr = matches
.value_of("connstr")
.expect("Postgres connection string is required");
let spec = matches.value_of("spec");
let spec_path = matches.value_of("spec-path");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
let spec: ComputeSpec = match spec {
// First, try to get cluster spec from the cli argument
Some(json) => serde_json::from_str(json)?,
None => {
// Second, try to read it from the file if path is provided
if let Some(sp) = spec_path {
let path = Path::new(sp);
let file = File::open(path)?;
serde_json::from_reader(file)?
} else {
panic!("cluster spec should be provided via --spec or --spec-path argument");
}
}
};
let pageserver_connstr = spec
.cluster
.settings
.find("neon.pageserver_connstring")
.expect("pageserver connstr should be provided");
let tenant = spec
.cluster
.settings
.find("neon.tenant_id")
.expect("tenant id should be provided");
let timeline = spec
.cluster
.settings
.find("neon.timeline_id")
.expect("tenant id should be provided");
let compute_state = ComputeNode {
start_time: Utc::now(),
connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
spec,
tenant,
timeline,
pageserver_connstr,
metrics: ComputeMetrics::new(),
state: RwLock::new(ComputeState::new()),
};
let compute = Arc::new(compute_state);
// Launch service threads first, so we were able to serve availability
// requests, while configuration is still in progress.
let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
// Run compute (Postgres) and hang waiting on it.
match compute.prepare_and_run() {
Ok(ec) => {
let code = ec.code().unwrap_or(1);
info!("Postgres exited with code {}, shutting down", code);
exit(code)
}
Err(error) => {
error!("could not start the compute node: {:?}", error);
let mut state = compute.state.write().unwrap();
state.error = Some(format!("{:?}", error));
state.status = ComputeStatus::Failed;
drop(state);
// Keep serving HTTP requests, so the cloud control plane was able to
// get the actual error.
info!("giving control plane 30s to collect the error before shutdown");
thread::sleep(Duration::from_secs(30));
info!("shutting down");
Err(error)
}
}
}

View File

@@ -0,0 +1,252 @@
//!
//! Postgres wrapper (`zenith_ctl`) is intended to be run as a Docker entrypoint or as a `systemd`
//! `ExecStart` option. It will handle all the `zenith` specifics during compute node
//! initialization:
//! - `zenith_ctl` accepts cluster (compute node) specification as a JSON file.
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
//! - Try to start `postgres` and wait until it is ready to accept connections.
//! - Check and alter/drop/create roles and databases.
//! - Hang waiting on the `postmaster` process to exit.
//!
//! Also `zenith_ctl` spawns two separate service threads:
//! - `compute-monitor` checks the last Postgres activity timestamp and saves it
//! into the shared `ComputeState`;
//! - `http-endpoint` runs a Hyper HTTP API server, which serves readiness and the
//! last activity requests.
//!
//! Usage example:
//! ```sh
//! zenith_ctl -D /var/db/postgres/compute \
//! -C 'postgresql://zenith_admin@localhost/postgres' \
//! -S /var/db/postgres/specs/current.json \
//! -b /usr/local/bin/postgres
//! ```
//!
use std::fs::File;
use std::panic;
use std::path::Path;
use std::process::{exit, Command, ExitStatus};
use std::sync::{Arc, RwLock};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Arg;
use log::info;
use postgres::{Client, NoTls};
use compute_tools::checker::create_writablity_check_data;
use compute_tools::config;
use compute_tools::http_api::launch_http_server;
use compute_tools::logger::*;
use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::pg_helpers::*;
use compute_tools::spec::*;
use compute_tools::zenith::*;
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
fn prepare_pgdata(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
let state = state.read().unwrap();
let spec = &state.spec;
let pgdata_path = Path::new(&state.pgdata);
let pageserver_connstr = spec
.cluster
.settings
.find("zenith.page_server_connstring")
.expect("pageserver connstr should be provided");
let tenant = spec
.cluster
.settings
.find("zenith.zenith_tenant")
.expect("tenant id should be provided");
let timeline = spec
.cluster
.settings
.find("zenith.zenith_timeline")
.expect("tenant id should be provided");
info!(
"starting cluster #{}, operation #{}",
spec.cluster.cluster_id,
spec.operation_uuid.as_ref().unwrap()
);
// Remove/create an empty pgdata directory and put configuration there.
create_pgdata(&state.pgdata)?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
info!("starting safekeepers syncing");
let lsn = sync_safekeepers(&state.pgdata, &state.pgbin)
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
info!(
"getting basebackup@{} from pageserver {}",
lsn, pageserver_connstr
);
get_basebackup(&state.pgdata, &pageserver_connstr, &tenant, &timeline, &lsn).with_context(
|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, pageserver_connstr
)
},
)?;
// Update pg_hba.conf received with basebackup.
update_pg_hba(pgdata_path)?;
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
fn run_compute(state: &Arc<RwLock<ComputeState>>) -> Result<ExitStatus> {
let read_state = state.read().unwrap();
let pgdata_path = Path::new(&read_state.pgdata);
// Run postgres as a child process.
let mut pg = Command::new(&read_state.pgbin)
.args(&["-D", &read_state.pgdata])
.spawn()
.expect("cannot start postgres process");
// Try default Postgres port if it is not provided
let port = read_state
.spec
.cluster
.settings
.find("port")
.unwrap_or_else(|| "5432".to_string());
wait_for_postgres(&port, pgdata_path)?;
let mut client = Client::connect(&read_state.connstr, NoTls)?;
handle_roles(&read_state.spec, &mut client)?;
handle_databases(&read_state.spec, &mut client)?;
handle_grants(&read_state.spec, &mut client)?;
create_writablity_check_data(&mut client)?;
// 'Close' connection
drop(client);
info!(
"finished configuration of cluster #{}",
read_state.spec.cluster.cluster_id
);
// Release the read lock.
drop(read_state);
// Get the write lock, update state and release the lock, so HTTP API
// was able to serve requests, while we are blocked waiting on
// Postgres.
let mut state = state.write().unwrap();
state.ready = true;
drop(state);
// Wait for child postgres process basically forever. In this state Ctrl+C
// will be propagated to postgres and it will be shut down as well.
let ecode = pg.wait().expect("failed to wait on postgres");
Ok(ecode)
}
fn main() -> Result<()> {
// TODO: re-use `utils::logging` later
init_logger(DEFAULT_LOG_LEVEL)?;
// Env variable is set by `cargo`
let version: Option<&str> = option_env!("CARGO_PKG_VERSION");
let matches = clap::App::new("zenith_ctl")
.version(version.unwrap_or("unknown"))
.arg(
Arg::new("connstr")
.short('C')
.long("connstr")
.value_name("DATABASE_URL")
.required(true),
)
.arg(
Arg::new("pgdata")
.short('D')
.long("pgdata")
.value_name("DATADIR")
.required(true),
)
.arg(
Arg::new("pgbin")
.short('b')
.long("pgbin")
.value_name("POSTGRES_PATH"),
)
.arg(
Arg::new("spec")
.short('s')
.long("spec")
.value_name("SPEC_JSON"),
)
.arg(
Arg::new("spec-path")
.short('S')
.long("spec-path")
.value_name("SPEC_PATH"),
)
.get_matches();
let pgdata = matches.value_of("pgdata").expect("PGDATA path is required");
let connstr = matches
.value_of("connstr")
.expect("Postgres connection string is required");
let spec = matches.value_of("spec");
let spec_path = matches.value_of("spec-path");
// Try to use just 'postgres' if no path is provided
let pgbin = matches.value_of("pgbin").unwrap_or("postgres");
let spec: ClusterSpec = match spec {
// First, try to get cluster spec from the cli argument
Some(json) => serde_json::from_str(json)?,
None => {
// Second, try to read it from the file if path is provided
if let Some(sp) = spec_path {
let path = Path::new(sp);
let file = File::open(path)?;
serde_json::from_reader(file)?
} else {
panic!("cluster spec should be provided via --spec or --spec-path argument");
}
}
};
let compute_state = ComputeState {
connstr: connstr.to_string(),
pgdata: pgdata.to_string(),
pgbin: pgbin.to_string(),
spec,
ready: false,
last_active: Utc::now(),
};
let compute_state = Arc::new(RwLock::new(compute_state));
// Launch service threads first, so we were able to serve availability
// requests, while configuration is still in progress.
let mut _threads = vec![
launch_http_server(&compute_state).expect("cannot launch compute monitor thread"),
launch_monitor(&compute_state).expect("cannot launch http endpoint thread"),
];
prepare_pgdata(&compute_state)?;
// Run compute (Postgres) and hang waiting on it. Panic if any error happens,
// it will help us to trigger unwind and kill postmaster as well.
match run_compute(&compute_state) {
Ok(ec) => exit(ec.success() as i32),
Err(error) => panic!("cannot start compute node, error: {}", error),
}
}

View File

@@ -1,9 +1,11 @@
use std::sync::{Arc, RwLock};
use anyhow::{anyhow, Result};
use log::error;
use postgres::Client;
use tokio_postgres::NoTls;
use crate::compute::ComputeNode;
use crate::zenith::ComputeState;
pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
let query = "
@@ -21,8 +23,9 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
Ok(())
}
pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
pub async fn check_writability(state: &Arc<RwLock<ComputeState>>) -> Result<()> {
let connstr = state.read().unwrap().connstr.clone();
let (client, connection) = tokio_postgres::connect(&connstr, NoTls).await?;
if client.is_closed() {
return Err(anyhow!("connection to postgres closed"));
}

View File

@@ -1,350 +0,0 @@
//
// XXX: This starts to be scarry similar to the `PostgresNode` from `control_plane`,
// but there are several things that makes `PostgresNode` usage inconvenient in the
// cloud:
// - it inherits from `LocalEnv`, which contains **all-all** the information about
// a complete service running
// - it uses `PageServerNode` with information about http endpoint, which we do not
// need in the cloud again
// - many tiny pieces like, for example, we do not use `pg_ctl` in the cloud
//
// Thus, to use `PostgresNode` in the cloud, we need to 'mock' a bunch of required
// attributes (not required for the cloud). Yet, it is still tempting to unify these
// `PostgresNode` and `ComputeNode` and use one in both places.
//
// TODO: stabilize `ComputeNode` and think about using it in the `control_plane`.
//
use std::fs;
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::{Command, ExitStatus, Stdio};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::RwLock;
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use log::info;
use postgres::{Client, NoTls};
use serde::{Serialize, Serializer};
use crate::checker::create_writablity_check_data;
use crate::config;
use crate::pg_helpers::*;
use crate::spec::*;
/// Compute node info shared across several `compute_ctl` threads.
pub struct ComputeNode {
pub start_time: DateTime<Utc>,
// Url type maintains proper escaping
pub connstr: url::Url,
pub pgdata: String,
pub pgbin: String,
pub spec: ComputeSpec,
pub tenant: String,
pub timeline: String,
pub pageserver_connstr: String,
pub metrics: ComputeMetrics,
/// Volatile part of the `ComputeNode` so should be used under `RwLock`
/// to allow HTTP API server to serve status requests, while configuration
/// is in progress.
pub state: RwLock<ComputeState>,
}
fn rfc3339_serialize<S>(x: &DateTime<Utc>, s: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
x.to_rfc3339().serialize(s)
}
#[derive(Serialize)]
#[serde(rename_all = "snake_case")]
pub struct ComputeState {
pub status: ComputeStatus,
/// Timestamp of the last Postgres activity
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: DateTime<Utc>,
pub error: Option<String>,
}
impl ComputeState {
pub fn new() -> Self {
Self {
status: ComputeStatus::Init,
last_active: Utc::now(),
error: None,
}
}
}
impl Default for ComputeState {
fn default() -> Self {
Self::new()
}
}
#[derive(Serialize, Clone, Copy, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {
Init,
Running,
Failed,
}
#[derive(Serialize)]
pub struct ComputeMetrics {
pub sync_safekeepers_ms: AtomicU64,
pub basebackup_ms: AtomicU64,
pub config_ms: AtomicU64,
pub total_startup_ms: AtomicU64,
}
impl ComputeMetrics {
pub fn new() -> Self {
Self {
sync_safekeepers_ms: AtomicU64::new(0),
basebackup_ms: AtomicU64::new(0),
config_ms: AtomicU64::new(0),
total_startup_ms: AtomicU64::new(0),
}
}
}
impl Default for ComputeMetrics {
fn default() -> Self {
Self::new()
}
}
impl ComputeNode {
pub fn set_status(&self, status: ComputeStatus) {
self.state.write().unwrap().status = status;
}
pub fn get_status(&self) -> ComputeStatus {
self.state.read().unwrap().status
}
// Remove `pgdata` directory and create it again with right permissions.
fn create_pgdata(&self) -> Result<()> {
// Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
// If it is something different then create_dir() will error out anyway.
let _ok = fs::remove_dir_all(&self.pgdata);
fs::create_dir(&self.pgdata)?;
fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?;
Ok(())
}
// Get basebackup from the libpq connection to pageserver using `connstr` and
// unarchive it to `pgdata` directory overriding all its previous content.
fn get_basebackup(&self, lsn: &str) -> Result<()> {
let start_time = Utc::now();
let mut client = Client::connect(&self.pageserver_connstr, NoTls)?;
let basebackup_cmd = match lsn {
"0/0" => format!("basebackup {} {}", &self.tenant, &self.timeline), // First start of the compute
_ => format!("basebackup {} {} {}", &self.tenant, &self.timeline, lsn),
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
// Read the archive directly from the `CopyOutReader`
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata)?;
self.metrics.basebackup_ms.store(
Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
Ok(())
}
// Run `postgres` in a special mode with `--sync-safekeepers` argument
// and return the reported LSN back to the caller.
fn sync_safekeepers(&self) -> Result<String> {
let start_time = Utc::now();
let sync_handle = Command::new(&self.pgbin)
.args(&["--sync-safekeepers"])
.env("PGDATA", &self.pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped())
.spawn()
.expect("postgres --sync-safekeepers failed to start");
// `postgres --sync-safekeepers` will print all log output to stderr and
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
// redirected to the caller output.
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() {
anyhow::bail!(
"postgres --sync-safekeepers exited with non-zero status: {}",
sync_output.status,
);
}
self.metrics.sync_safekeepers_ms.store(
Utc::now()
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
Ok(lsn)
}
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
pub fn prepare_pgdata(&self) -> Result<()> {
let spec = &self.spec;
let pgdata_path = Path::new(&self.pgdata);
// Remove/create an empty pgdata directory and put configuration there.
self.create_pgdata()?;
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
info!("starting safekeepers syncing");
let lsn = self
.sync_safekeepers()
.with_context(|| "failed to sync safekeepers")?;
info!("safekeepers synced at LSN {}", lsn);
info!(
"getting basebackup@{} from pageserver {}",
lsn, &self.pageserver_connstr
);
self.get_basebackup(&lsn).with_context(|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, &self.pageserver_connstr
)
})?;
// Update pg_hba.conf received with basebackup.
update_pg_hba(pgdata_path)?;
Ok(())
}
/// Start Postgres as a child process and manage DBs/roles.
/// After that this will hang waiting on the postmaster process to exit.
pub fn run(&self) -> Result<ExitStatus> {
let start_time = Utc::now();
let pgdata_path = Path::new(&self.pgdata);
// Run postgres as a child process.
let mut pg = Command::new(&self.pgbin)
.args(&["-D", &self.pgdata])
.spawn()
.expect("cannot start postgres process");
// Try default Postgres port if it is not provided
let port = self
.spec
.cluster
.settings
.find("port")
.unwrap_or_else(|| "5432".to_string());
wait_for_postgres(&mut pg, &port, pgdata_path)?;
// If connection fails,
// it may be the old node with `zenith_admin` superuser.
//
// In this case we need to connect with old `zenith_admin`name
// and create new user. We cannot simply rename connected user,
// but we can create a new one and grant it all privileges.
let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
Err(e) => {
info!(
"cannot connect to postgres: {}, retrying with `zenith_admin` username",
e
);
let mut zenith_admin_connstr = self.connstr.clone();
zenith_admin_connstr
.set_username("zenith_admin")
.map_err(|_| anyhow::anyhow!("invalid connstr"))?;
let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connsting with expected name
Client::connect(self.connstr.as_str(), NoTls)?
}
Ok(client) => client,
};
handle_roles(&self.spec, &mut client)?;
handle_databases(&self.spec, &mut client)?;
handle_role_deletions(self, &mut client)?;
handle_grants(self, &mut client)?;
create_writablity_check_data(&mut client)?;
// 'Close' connection
drop(client);
let startup_end_time = Utc::now();
self.metrics.config_ms.store(
startup_end_time
.signed_duration_since(start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
self.metrics.total_startup_ms.store(
startup_end_time
.signed_duration_since(self.start_time)
.to_std()
.unwrap()
.as_millis() as u64,
Ordering::Relaxed,
);
self.set_status(ComputeStatus::Running);
info!(
"finished configuration of compute for project {}",
self.spec.cluster.cluster_id
);
// Wait for child Postgres process basically forever. In this state Ctrl+C
// will propagate to Postgres and it will be shut down as well.
let ecode = pg
.wait()
.expect("failed to start waiting on Postgres process");
Ok(ecode)
}
pub fn prepare_and_run(&self) -> Result<ExitStatus> {
info!(
"starting compute for project {}, operation {}, tenant {}, timeline {}",
self.spec.cluster.cluster_id,
self.spec.operation_uuid.as_ref().unwrap(),
self.tenant,
self.timeline,
);
self.prepare_pgdata()?;
self.run()
}
}

View File

@@ -6,7 +6,7 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::PgOptionsSerialize;
use crate::spec::ComputeSpec;
use crate::zenith::ClusterSpec;
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -32,20 +32,20 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
}
/// Create or completely rewrite configuration file specified by `path`
pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> {
pub fn write_postgres_conf(path: &Path, spec: &ClusterSpec) -> Result<()> {
// File::create() destroys the file content if it exists.
let mut postgres_conf = File::create(path)?;
write_auto_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
write_zenith_managed_block(&mut postgres_conf, &spec.cluster.settings.as_pg_settings())?;
Ok(())
}
// Write Postgres config block wrapped with generated comment section
fn write_auto_managed_block(file: &mut File, buf: &str) -> Result<()> {
writeln!(file, "# Managed by compute_ctl: begin")?;
fn write_zenith_managed_block(file: &mut File, buf: &str) -> Result<()> {
writeln!(file, "# Managed by Zenith: begin")?;
writeln!(file, "{}", buf)?;
writeln!(file, "# Managed by compute_ctl: end")?;
writeln!(file, "# Managed by Zenith: end")?;
Ok(())
}

View File

@@ -1 +0,0 @@
pub mod api;

View File

@@ -1,158 +0,0 @@
openapi: "3.0.2"
info:
title: Compute node control API
version: "1.0"
servers:
- url: "http://localhost:3080"
paths:
/status:
get:
tags:
- "info"
summary: Get compute node internal status
description: ""
operationId: getComputeStatus
responses:
"200":
description: ComputeState
content:
application/json:
schema:
$ref: "#/components/schemas/ComputeState"
/metrics.json:
get:
tags:
- "info"
summary: Get compute node startup metrics in JSON format
description: ""
operationId: getComputeMetricsJSON
responses:
"200":
description: ComputeMetrics
content:
application/json:
schema:
$ref: "#/components/schemas/ComputeMetrics"
/ready:
get:
deprecated: true
tags:
- "info"
summary: Check whether compute startup process finished successfully
description: ""
operationId: computeIsReady
responses:
"200":
description: Compute is ready ('true') or not ('false')
content:
text/plain:
schema:
type: string
example: "true"
/last_activity:
get:
deprecated: true
tags:
- "info"
summary: Get timestamp of the last compute activity
description: ""
operationId: getLastComputeActivityTS
responses:
"200":
description: Timestamp of the last compute activity
content:
text/plain:
schema:
type: string
example: "2022-10-12T07:20:50.52Z"
/check_writability:
get:
deprecated: true
tags:
- "check"
summary: Check that we can write new data on this compute
description: ""
operationId: checkComputeWritabilityDeprecated
responses:
"200":
description: Check result
content:
text/plain:
schema:
type: string
description: Error text or 'true' if check passed
example: "true"
post:
tags:
- "check"
summary: Check that we can write new data on this compute
description: ""
operationId: checkComputeWritability
responses:
"200":
description: Check result
content:
text/plain:
schema:
type: string
description: Error text or 'true' if check passed
example: "true"
components:
securitySchemes:
JWT:
type: http
scheme: bearer
bearerFormat: JWT
schemas:
ComputeMetrics:
type: object
description: Compute startup metrics
required:
- sync_safekeepers_ms
- basebackup_ms
- config_ms
- total_startup_ms
properties:
sync_safekeepers_ms:
type: integer
basebackup_ms:
type: integer
config_ms:
type: integer
total_startup_ms:
type: integer
ComputeState:
type: object
required:
- status
- last_active
properties:
status:
$ref: '#/components/schemas/ComputeStatus'
last_active:
type: string
description: The last detected compute activity timestamp in UTC and RFC3339 format
example: "2022-10-12T07:20:50.52Z"
error:
type: string
description: Text of the error during compute startup, if any
ComputeStatus:
type: string
enum:
- init
- failed
- running
security:
- JWT: []

View File

@@ -1,64 +1,37 @@
use std::convert::Infallible;
use std::net::SocketAddr;
use std::sync::Arc;
use std::sync::{Arc, RwLock};
use std::thread;
use anyhow::Result;
use hyper::service::{make_service_fn, service_fn};
use hyper::{Body, Method, Request, Response, Server, StatusCode};
use log::{error, info};
use serde_json;
use crate::compute::{ComputeNode, ComputeStatus};
use crate::zenith::*;
// Service function to handle all available routes.
async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body> {
async fn routes(req: Request<Body>, state: Arc<RwLock<ComputeState>>) -> Response<Body> {
match (req.method(), req.uri().path()) {
// Timestamp of the last Postgres activity in the plain text.
// DEPRECATED in favour of /status
(&Method::GET, "/last_activity") => {
info!("serving /last_active GET request");
let state = compute.state.read().unwrap();
let state = state.read().unwrap();
// Use RFC3339 format for consistency.
Response::new(Body::from(state.last_active.to_rfc3339()))
}
// Has compute setup process finished? -> true/false.
// DEPRECATED in favour of /status
// Has compute setup process finished? -> true/false
(&Method::GET, "/ready") => {
info!("serving /ready GET request");
let status = compute.get_status();
Response::new(Body::from(format!("{}", status == ComputeStatus::Running)))
let state = state.read().unwrap();
Response::new(Body::from(format!("{}", state.ready)))
}
// Serialized compute state.
(&Method::GET, "/status") => {
info!("serving /status GET request");
let state = compute.state.read().unwrap();
Response::new(Body::from(serde_json::to_string(&*state).unwrap()))
}
// Startup metrics in JSON format. Keep /metrics reserved for a possible
// future use for Prometheus metrics format.
(&Method::GET, "/metrics.json") => {
info!("serving /metrics.json GET request");
Response::new(Body::from(serde_json::to_string(&compute.metrics).unwrap()))
}
// DEPRECATED, use POST instead
(&Method::GET, "/check_writability") => {
info!("serving /check_writability GET request");
let res = crate::checker::check_writability(&compute).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => Response::new(Body::from(e.to_string())),
}
}
(&Method::POST, "/check_writability") => {
info!("serving /check_writability POST request");
let res = crate::checker::check_writability(&compute).await;
let res = crate::checker::check_writability(&state).await;
match res {
Ok(_) => Response::new(Body::from("true")),
Err(e) => Response::new(Body::from(e.to_string())),
@@ -76,7 +49,7 @@ async fn routes(req: Request<Body>, compute: Arc<ComputeNode>) -> Response<Body>
// Main Hyper HTTP server function that runs it and blocks waiting on it forever.
#[tokio::main]
async fn serve(state: Arc<ComputeNode>) {
async fn serve(state: Arc<RwLock<ComputeState>>) {
let addr = SocketAddr::from(([0, 0, 0, 0], 3080));
let make_service = make_service_fn(move |_conn| {
@@ -100,7 +73,7 @@ async fn serve(state: Arc<ComputeNode>) {
}
/// Launch a separate Hyper HTTP API server thread and return its `JoinHandle`.
pub fn launch_http_server(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_http_server(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()

View File

@@ -4,11 +4,11 @@
//!
pub mod checker;
pub mod config;
pub mod http;
pub mod http_api;
#[macro_use]
pub mod logger;
pub mod compute;
pub mod monitor;
pub mod params;
pub mod pg_helpers;
pub mod spec;
pub mod zenith;

View File

@@ -1,4 +1,4 @@
use std::sync::Arc;
use std::sync::{Arc, RwLock};
use std::{thread, time};
use anyhow::Result;
@@ -6,18 +6,18 @@ use chrono::{DateTime, Utc};
use log::{debug, info};
use postgres::{Client, NoTls};
use crate::compute::ComputeNode;
use crate::zenith::ComputeState;
const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
// Spin in a loop and figure out the last activity time in the Postgres.
// Then update it in the shared state. This function never errors out.
// XXX: the only expected panic is at `RwLock` unwrap().
fn watch_compute_activity(compute: &ComputeNode) {
fn watch_compute_activity(state: &Arc<RwLock<ComputeState>>) {
// Suppose that `connstr` doesn't change
let connstr = compute.connstr.as_str();
let connstr = state.read().unwrap().connstr.clone();
// Define `client` outside of the loop to reuse existing connection if it's active.
let mut client = Client::connect(connstr, NoTls);
let mut client = Client::connect(&connstr, NoTls);
let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);
info!("watching Postgres activity at {}", connstr);
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
info!("connection to postgres closed, trying to reconnect");
// Connection is closed, reconnect and try again.
client = Client::connect(connstr, NoTls);
client = Client::connect(&connstr, NoTls);
continue;
}
@@ -43,10 +43,10 @@ fn watch_compute_activity(compute: &ComputeNode) {
FROM pg_stat_activity
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
AND usename != 'zenith_admin';", // XXX: find a better way to filter other monitors?
&[],
);
let mut last_active = compute.state.read().unwrap().last_active;
let mut last_active = state.read().unwrap().last_active;
if let Ok(backs) = backends {
let mut idle_backs: Vec<DateTime<Utc>> = vec![];
@@ -83,24 +83,24 @@ fn watch_compute_activity(compute: &ComputeNode) {
}
// Update the last activity in the shared state if we got a more recent one.
let mut state = compute.state.write().unwrap();
let mut state = state.write().unwrap();
if last_active > state.last_active {
state.last_active = last_active;
debug!("set the last compute activity time to: {}", last_active);
}
}
Err(e) => {
debug!("cannot connect to postgres: {}, retrying", e);
info!("cannot connect to postgres: {}, retrying", e);
// Establish a new connection and try again.
client = Client::connect(connstr, NoTls);
client = Client::connect(&connstr, NoTls);
}
}
}
}
/// Launch a separate compute monitor thread and return its `JoinHandle`.
pub fn launch_monitor(state: &Arc<ComputeNode>) -> Result<thread::JoinHandle<()>> {
pub fn launch_monitor(state: &Arc<RwLock<ComputeState>>) -> Result<thread::JoinHandle<()>> {
let state = Arc::clone(state);
Ok(thread::Builder::new()

View File

@@ -1,10 +1,7 @@
use std::fmt::Write;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::net::{SocketAddr, TcpStream};
use std::os::unix::fs::PermissionsExt;
use std::path::Path;
use std::process::Child;
use std::process::Command;
use std::str::FromStr;
use std::{fs, thread, time};
@@ -139,11 +136,9 @@ impl Role {
// Now we also support SCRAM-SHA-256 and to preserve compatibility
// we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
if pass.starts_with("SCRAM-SHA-256") {
write!(params, " PASSWORD '{pass}'")
.expect("String is documented to not to error during write operations");
params.push_str(&format!(" PASSWORD '{}'", pass));
} else {
write!(params, " PASSWORD 'md5{pass}'")
.expect("String is documented to not to error during write operations");
params.push_str(&format!(" PASSWORD 'md5{}'", pass));
}
} else {
params.push_str(" PASSWORD NULL");
@@ -161,8 +156,7 @@ impl Database {
/// it may require a proper quoting too.
pub fn to_pg_options(&self) -> String {
let mut params: String = self.options.as_pg_options();
write!(params, " OWNER {}", &self.owner.quote())
.expect("String is documented to not to error during write operations");
params.push_str(&format!(" OWNER {}", &self.owner.quote()));
params
}
@@ -226,12 +220,12 @@ pub fn get_existing_dbs(client: &mut Client) -> Result<Vec<Database>> {
/// Wait for Postgres to become ready to accept connections:
/// - state should be `ready` in the `pgdata/postmaster.pid`
/// - and we should be able to connect to 127.0.0.1:5432
pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()> {
pub fn wait_for_postgres(port: &str, pgdata: &Path) -> Result<()> {
let pid_path = pgdata.join("postmaster.pid");
let mut slept: u64 = 0; // ms
let pause = time::Duration::from_millis(100);
let timeout = time::Duration::from_millis(10);
let timeout = time::Duration::from_millis(200);
let addr = SocketAddr::from_str(&format!("127.0.0.1:{}", port)).unwrap();
loop {
@@ -242,26 +236,19 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
bail!("timed out while waiting for Postgres to start");
}
if let Ok(Some(status)) = pg.try_wait() {
// Postgres exited, that is not what we expected, bail out earlier.
let code = status.code().unwrap_or(-1);
bail!("Postgres exited unexpectedly with code {}", code);
}
if pid_path.exists() {
// XXX: dumb and the simplest way to get the last line in a text file
// TODO: better use `.lines().last()` later
let stdout = Command::new("tail")
.args(&["-n1", pid_path.to_str().unwrap()])
.output()?
.stdout;
let status = String::from_utf8(stdout)?;
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
// Check that we can open pid file first.
if let Ok(file) = File::open(&pid_path) {
let file = BufReader::new(file);
let last_line = file.lines().last();
// Pid file could be there and we could read it, but it could be empty, for example.
if let Some(Ok(line)) = last_line {
let status = line.trim();
let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
// Now Postgres is ready to accept connections
if status == "ready" && can_connect {
break;
}
// Now Postgres is ready to accept connections
if status.trim() == "ready" && can_connect {
break;
}
}

View File

@@ -2,55 +2,17 @@ use std::path::Path;
use anyhow::Result;
use log::{info, log_enabled, warn, Level};
use postgres::{Client, NoTls};
use serde::Deserialize;
use postgres::Client;
use crate::compute::ComputeNode;
use crate::config;
use crate::params::PG_HBA_ALL_MD5;
use crate::pg_helpers::*;
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Deserialize)]
pub struct ComputeSpec {
pub format_version: f32,
pub timestamp: String,
pub operation_uuid: Option<String>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
}
/// Cluster state seen from the perspective of the external tools
/// like Rails web console.
#[derive(Clone, Deserialize)]
pub struct Cluster {
pub cluster_id: String,
pub name: String,
pub state: Option<String>,
pub roles: Vec<Role>,
pub databases: Vec<Database>,
pub settings: GenericOptions,
}
/// Single cluster state changing operation that could not be represented as
/// a static `Cluster` structure. For example:
/// - DROP DATABASE
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Deserialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
pub new_name: Option<PgIdent>,
}
use crate::zenith::ClusterSpec;
/// It takes cluster specification and does the following:
/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
/// - Update `pg_hba.conf` to allow external connections.
pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
pub fn handle_configuration(spec: &ClusterSpec, pgdata_path: &Path) -> Result<()> {
// File `postgresql.conf` is no longer included into `basebackup`, so just
// always write all config into it creating new file.
config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?;
@@ -77,7 +39,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
/// Given a cluster spec json and open transaction it handles roles creation,
/// deletion and update.
pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
pub fn handle_roles(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
let mut xact = client.transaction()?;
let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;
@@ -98,13 +60,18 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
// Process delta operations first
if let Some(ops) = &spec.delta_operations {
info!("processing role renames");
info!("processing delta operations on roles");
for op in ops {
match op.action.as_ref() {
// We do not check either role exists or not,
// Postgres will take care of it for us
"delete_role" => {
// no-op now, roles will be deleted at the end of configuration
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
warn!("deleting role '{}'", &op.name);
xact.execute(query.as_str(), &[])?;
}
// Renaming role drops its password, since role name is
// Renaming role drops its password, since tole name is
// used as a salt there. It is important that this role
// is recorded with a new `name` in the `roles` list.
// Follow up roles update will set the new password.
@@ -169,20 +136,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
xact.execute(query.as_str(), &[])?;
}
} else {
info!("role name: '{}'", &name);
info!("role name {}", &name);
let mut query: String = format!("CREATE ROLE {} ", name.quote());
info!("role create query: '{}'", &query);
info!("role create query {}", &query);
info_print!(" -> create");
query.push_str(&role.to_pg_options());
xact.execute(query.as_str(), &[])?;
let grant_query = format!(
"GRANT pg_read_all_data, pg_write_all_data TO {}",
name.quote()
);
xact.execute(grant_query.as_str(), &[])?;
info!("role grant query: '{}'", &grant_query);
}
info_print!("\n");
@@ -193,76 +153,12 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
Ok(())
}
/// Reassign all dependent objects and delete requested roles.
pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
// First, reassign all dependent objects to db owners.
if let Some(ops) = &spec.delta_operations {
info!("reassigning dependent objects of to-be-deleted roles");
for op in ops {
if op.action == "delete_role" {
reassign_owned_objects(node, &op.name)?;
}
}
}
// Second, proceed with role deletions.
let mut xact = client.transaction()?;
if let Some(ops) = &spec.delta_operations {
info!("processing role deletions");
for op in ops {
// We do not check either role exists or not,
// Postgres will take care of it for us
if op.action == "delete_role" {
let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
warn!("deleting role '{}'", &op.name);
xact.execute(query.as_str(), &[])?;
}
}
}
Ok(())
}
// Reassign all owned objects in all databases to the owner of the database.
fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
for db in &node.spec.cluster.databases {
if db.owner != *role_name {
let mut connstr = node.connstr.clone();
// database name is always the last and the only component of the path
connstr.set_path(&db.name);
let mut client = Client::connect(connstr.as_str(), NoTls)?;
// This will reassign all dependent objects to the db owner
let reassign_query = format!(
"REASSIGN OWNED BY {} TO {}",
role_name.quote(),
db.owner.quote()
);
info!(
"reassigning objects owned by '{}' in db '{}' to '{}'",
role_name, &db.name, &db.owner
);
client.simple_query(&reassign_query)?;
// This now will only drop privileges of the role
let drop_query = format!("DROP OWNED BY {}", role_name.quote());
client.simple_query(&drop_query)?;
}
}
Ok(())
}
/// It follows mostly the same logic as `handle_roles()` excepting that we
/// does not use an explicit transactions block, since major database operations
/// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
/// atomicity should be enough here due to the order of operations and various checks,
/// which together provide us idempotency.
pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
pub fn handle_databases(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
let existing_dbs: Vec<Database> = get_existing_dbs(client)?;
// Print a list of existing Postgres databases (only in debug mode)
@@ -349,80 +245,23 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
Ok(())
}
/// Grant CREATE ON DATABASE to the database owner and do some other alters and grants
/// to allow users creating trusted extensions and re-creating `public` schema, for example.
pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
let spec = &node.spec;
// Grant CREATE ON DATABASE to the database owner
// to allow clients create trusted extensions.
pub fn handle_grants(spec: &ClusterSpec, client: &mut Client) -> Result<()> {
info!("cluster spec grants:");
// We now have a separate `web_access` role to connect to the database
// via the web interface and proxy link auth. And also we grant a
// read / write all data privilege to every role. So also grant
// create to everyone.
// XXX: later we should stop messing with Postgres ACL in such horrible
// ways.
let roles = spec
.cluster
.roles
.iter()
.map(|r| r.name.quote())
.collect::<Vec<_>>();
for db in &spec.cluster.databases {
let dbname = &db.name;
let query: String = format!(
"GRANT CREATE ON DATABASE {} TO {}",
dbname.quote(),
roles.join(", ")
db.owner.quote()
);
info!("grant query {}", &query);
client.execute(query.as_str(), &[])?;
}
// Do some per-database access adjustments. We'd better do this at db creation time,
// but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
// atomically.
let mut db_connstr = node.connstr.clone();
for db in &node.spec.cluster.databases {
// database name is always the last and the only component of the path
db_connstr.set_path(&db.name);
let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
// This will only change ownership on the schema itself, not the objects
// inside it. Without it owner of the `public` schema will be `cloud_admin`
// and database owner cannot do anything with it. SQL procedure ensures
// that it won't error out if schema `public` doesn't exist.
let alter_query = format!(
"DO $$\n\
DECLARE\n\
schema_owner TEXT;\n\
BEGIN\n\
IF EXISTS(\n\
SELECT nspname\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
)\n\
THEN\n\
SELECT nspowner::regrole::text\n\
FROM pg_catalog.pg_namespace\n\
WHERE nspname = 'public'\n\
INTO schema_owner;\n\
\n\
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\
THEN\n\
ALTER SCHEMA public OWNER TO {};\n\
END IF;\n\
END IF;\n\
END\n\
$$;",
db.owner.quote()
);
db_client.simple_query(&alter_query)?;
}
Ok(())
}

109
compute_tools/src/zenith.rs Normal file
View File

@@ -0,0 +1,109 @@
use std::process::{Command, Stdio};
use anyhow::Result;
use chrono::{DateTime, Utc};
use postgres::{Client, NoTls};
use serde::Deserialize;
use crate::pg_helpers::*;
/// Compute node state shared across several `zenith_ctl` threads.
/// Should be used under `RwLock` to allow HTTP API server to serve
/// status requests, while configuration is in progress.
pub struct ComputeState {
pub connstr: String,
pub pgdata: String,
pub pgbin: String,
pub spec: ClusterSpec,
/// Compute setup process has finished
pub ready: bool,
/// Timestamp of the last Postgres activity
pub last_active: DateTime<Utc>,
}
/// Cluster spec or configuration represented as an optional number of
/// delta operations + final cluster state description.
#[derive(Clone, Deserialize)]
pub struct ClusterSpec {
pub format_version: f32,
pub timestamp: String,
pub operation_uuid: Option<String>,
/// Expected cluster state at the end of transition process.
pub cluster: Cluster,
pub delta_operations: Option<Vec<DeltaOp>>,
}
/// Cluster state seen from the perspective of the external tools
/// like Rails web console.
#[derive(Clone, Deserialize)]
pub struct Cluster {
pub cluster_id: String,
pub name: String,
pub state: Option<String>,
pub roles: Vec<Role>,
pub databases: Vec<Database>,
pub settings: GenericOptions,
}
/// Single cluster state changing operation that could not be represented as
/// a static `Cluster` structure. For example:
/// - DROP DATABASE
/// - DROP ROLE
/// - ALTER ROLE name RENAME TO new_name
/// - ALTER DATABASE name RENAME TO new_name
#[derive(Clone, Deserialize)]
pub struct DeltaOp {
pub action: String,
pub name: PgIdent,
pub new_name: Option<PgIdent>,
}
/// Get basebackup from the libpq connection to pageserver using `connstr` and
/// unarchive it to `pgdata` directory overriding all its previous content.
pub fn get_basebackup(
pgdata: &str,
connstr: &str,
tenant: &str,
timeline: &str,
lsn: &str,
) -> Result<()> {
let mut client = Client::connect(connstr, NoTls)?;
let basebackup_cmd = match lsn {
"0/0" => format!("basebackup {} {}", tenant, timeline), // First start of the compute
_ => format!("basebackup {} {} {}", tenant, timeline, lsn),
};
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
let mut ar = tar::Archive::new(copyreader);
ar.unpack(&pgdata)?;
Ok(())
}
/// Run `postgres` in a special mode with `--sync-safekeepers` argument
/// and return the reported LSN back to the caller.
pub fn sync_safekeepers(pgdata: &str, pgbin: &str) -> Result<String> {
let sync_handle = Command::new(&pgbin)
.args(&["--sync-safekeepers"])
.env("PGDATA", &pgdata) // we cannot use -D in this mode
.stdout(Stdio::piped())
.spawn()
.expect("postgres --sync-safekeepers failed to start");
// `postgres --sync-safekeepers` will print all log output to stderr and
// final LSN to stdout. So we pipe only stdout, while stderr will be automatically
// redirected to the caller output.
let sync_output = sync_handle
.wait_with_output()
.expect("postgres --sync-safekeepers failed");
if !sync_output.status.success() {
anyhow::bail!(
"postgres --sync-safekeepers exited with non-zero status: {}",
sync_output.status,
);
}
let lsn = String::from(String::from_utf8(sync_output.stdout)?.trim());
Ok(lsn)
}

View File

@@ -85,7 +85,7 @@
"vartype": "bool"
},
{
"name": "safekeepers",
"name": "wal_acceptors",
"value": "127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501",
"vartype": "string"
},
@@ -150,7 +150,7 @@
"vartype": "integer"
},
{
"name": "neon.tenant_id",
"name": "zenith.zenith_tenant",
"value": "b0554b632bd4d547a63b86c3630317e8",
"vartype": "string"
},
@@ -160,13 +160,13 @@
"vartype": "integer"
},
{
"name": "neon.timeline_id",
"name": "zenith.zenith_timeline",
"value": "2414a61ffc94e428f14b5758fe308e13",
"vartype": "string"
},
{
"name": "shared_preload_libraries",
"value": "neon",
"value": "zenith",
"vartype": "string"
},
{
@@ -175,7 +175,7 @@
"vartype": "string"
},
{
"name": "neon.pageserver_connstring",
"name": "zenith.page_server_connstring",
"value": "host=127.0.0.1 port=6400",
"vartype": "string"
}

View File

@@ -4,12 +4,12 @@ mod pg_helpers_tests {
use std::fs::File;
use compute_tools::pg_helpers::*;
use compute_tools::spec::ComputeSpec;
use compute_tools::zenith::ClusterSpec;
#[test]
fn params_serialize() {
let file = File::open("tests/cluster_spec.json").unwrap();
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
assert_eq!(
spec.cluster.databases.first().unwrap().to_pg_options(),
@@ -24,11 +24,11 @@ mod pg_helpers_tests {
#[test]
fn settings_serialize() {
let file = File::open("tests/cluster_spec.json").unwrap();
let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
let spec: ClusterSpec = serde_json::from_reader(file).unwrap();
assert_eq!(
spec.cluster.settings.as_pg_settings(),
"fsync = off\nwal_level = replica\nhot_standby = on\nsafekeepers = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nneon.tenant_id = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nneon.timeline_id = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'neon'\nsynchronous_standby_names = 'walproposer'\nneon.pageserver_connstring = 'host=127.0.0.1 port=6400'"
"fsync = off\nwal_level = replica\nhot_standby = on\nwal_acceptors = '127.0.0.1:6502,127.0.0.1:6503,127.0.0.1:6501'\nwal_log_hints = on\nlog_connections = on\nshared_buffers = 32768\nport = 55432\nmax_connections = 100\nmax_wal_senders = 10\nlisten_addresses = '0.0.0.0'\nwal_sender_timeout = 0\npassword_encryption = md5\nmaintenance_work_mem = 65536\nmax_parallel_workers = 8\nmax_worker_processes = 8\nzenith.zenith_tenant = 'b0554b632bd4d547a63b86c3630317e8'\nmax_replication_slots = 10\nzenith.zenith_timeline = '2414a61ffc94e428f14b5758fe308e13'\nshared_preload_libraries = 'zenith'\nsynchronous_standby_names = 'walproposer'\nzenith.page_server_connstring = 'host=127.0.0.1 port=6400'"
);
}

View File

@@ -4,16 +4,17 @@ version = "0.1.0"
edition = "2021"
[dependencies]
tar = "0.4.38"
tar = "0.4.33"
postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
serde = { version = "1.0", features = ["derive"] }
serde_with = "1.12.0"
toml = "0.5"
once_cell = "1.13.0"
lazy_static = "1.4"
regex = "1"
anyhow = "1.0"
thiserror = "1"
nix = "0.23"
url = "2.2.2"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
pageserver = { path = "../pageserver" }

View File

@@ -9,6 +9,3 @@ auth_type = 'Trust'
id = 1
pg_port = 5454
http_port = 7676
[etcd_broker]
broker_endpoints = ['http://127.0.0.1:2379']

View File

@@ -148,9 +148,9 @@ impl PostgresNode {
// Read a few options from the config file
let context = format!("in config file {}", cfg_path_str);
let port: u16 = conf.parse_field("port", &context)?;
let timeline_id: ZTimelineId = conf.parse_field("neon.timeline_id", &context)?;
let tenant_id: ZTenantId = conf.parse_field("neon.tenant_id", &context)?;
let uses_wal_proposer = conf.get("safekeepers").is_some();
let timeline_id: ZTimelineId = conf.parse_field("zenith.zenith_timeline", &context)?;
let tenant_id: ZTenantId = conf.parse_field("zenith.zenith_tenant", &context)?;
let uses_wal_proposer = conf.get("wal_acceptors").is_some();
// parse recovery_target_lsn, if any
let recovery_target_lsn: Option<Lsn> =
@@ -231,13 +231,8 @@ impl PostgresNode {
.context("page server 'basebackup' command failed")?;
// Read the archive directly from the `CopyOutReader`
//
// Set `ignore_zeros` so that unpack() reads all the Copy data and
// doesn't stop at the end-of-archive marker. Otherwise, if the server
// sends an Error after finishing the tarball, we will not notice it.
let mut ar = tar::Archive::new(copyreader);
ar.set_ignore_zeros(true);
ar.unpack(&self.pgdata())
tar::Archive::new(copyreader)
.unpack(&self.pgdata())
.context("extracting base backup failed")?;
Ok(())
@@ -279,8 +274,6 @@ impl PostgresNode {
conf.append("listen_addresses", &self.address.ip().to_string());
conf.append("port", &self.address.port().to_string());
conf.append("wal_keep_size", "0");
// walproposer panics when basebackup is invalid, it is pointless to restart in this case.
conf.append("restart_after_crash", "off");
// Configure the node to fetch pages from pageserver
let pageserver_connstr = {
@@ -303,11 +296,11 @@ impl PostgresNode {
// uses only needed variables namely host, port, user, password.
format!("postgresql://no_user:{}@{}:{}", password, host, port)
};
conf.append("shared_preload_libraries", "neon");
conf.append("shared_preload_libraries", "zenith");
conf.append_line("");
conf.append("neon.pageserver_connstring", &pageserver_connstr);
conf.append("neon.tenant_id", &self.tenant_id.to_string());
conf.append("neon.timeline_id", &self.timeline_id.to_string());
conf.append("zenith.page_server_connstring", &pageserver_connstr);
conf.append("zenith.zenith_tenant", &self.tenant_id.to_string());
conf.append("zenith.zenith_timeline", &self.timeline_id.to_string());
if let Some(lsn) = self.lsn {
conf.append("recovery_target_lsn", &lsn.to_string());
}
@@ -341,7 +334,7 @@ impl PostgresNode {
.map(|sk| format!("localhost:{}", sk.pg_port))
.collect::<Vec<String>>()
.join(",");
conf.append("safekeepers", &safekeepers);
conf.append("wal_acceptors", &safekeepers);
} else {
// We only use setup without safekeepers for tests,
// and don't care about data durability on pageserver,
@@ -352,6 +345,7 @@ impl PostgresNode {
// This isn't really a supported configuration, but can be useful for
// testing.
conf.append("synchronous_standby_names", "pageserver");
conf.append("zenith.callmemaybe_connstring", &self.connstr());
}
let mut file = File::create(self.pgdata().join("postgresql.conf"))?;
@@ -498,7 +492,7 @@ impl PostgresNode {
"host={} port={} user={} dbname={}",
self.address.ip(),
self.address.port(),
"cloud_admin",
"zenith_admin",
"postgres"
)
}

View File

@@ -1,97 +0,0 @@
use std::{
fs,
path::PathBuf,
process::{Command, Stdio},
};
use anyhow::Context;
use nix::{
sys::signal::{kill, Signal},
unistd::Pid,
};
use crate::{local_env, read_pidfile};
pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_broker = &env.etcd_broker;
println!(
"Starting etcd broker using {}",
etcd_broker.etcd_binary_path.display()
);
let etcd_data_dir = env.base_data_dir.join("etcd");
fs::create_dir_all(&etcd_data_dir).with_context(|| {
format!(
"Failed to create etcd data dir: {}",
etcd_data_dir.display()
)
})?;
let etcd_stdout_file =
fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
format!(
"Failed to create etcd stout file in directory {}",
etcd_data_dir.display()
)
})?;
let etcd_stderr_file =
fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
format!(
"Failed to create etcd stderr file in directory {}",
etcd_data_dir.display()
)
})?;
let client_urls = etcd_broker.comma_separated_endpoints();
let etcd_process = Command::new(&etcd_broker.etcd_binary_path)
.args(&[
format!("--data-dir={}", etcd_data_dir.display()),
format!("--listen-client-urls={client_urls}"),
format!("--advertise-client-urls={client_urls}"),
// Set --quota-backend-bytes to keep the etcd virtual memory
// size smaller. Our test etcd clusters are very small.
// See https://github.com/etcd-io/etcd/issues/7910
"--quota-backend-bytes=100000000".to_string(),
])
.stdout(Stdio::from(etcd_stdout_file))
.stderr(Stdio::from(etcd_stderr_file))
.spawn()
.context("Failed to spawn etcd subprocess")?;
let pid = etcd_process.id();
let etcd_pid_file_path = etcd_pid_file_path(env);
fs::write(&etcd_pid_file_path, pid.to_string()).with_context(|| {
format!(
"Failed to create etcd pid file at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
}
pub fn stop_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let etcd_path = &env.etcd_broker.etcd_binary_path;
println!("Stopping etcd broker at {}", etcd_path.display());
let etcd_pid_file_path = etcd_pid_file_path(env);
let pid = Pid::from_raw(read_pidfile(&etcd_pid_file_path).with_context(|| {
format!(
"Failed to read etcd pid file at {}",
etcd_pid_file_path.display()
)
})?);
kill(pid, Signal::SIGTERM).with_context(|| {
format!(
"Failed to stop etcd with pid {pid} at {}",
etcd_pid_file_path.display()
)
})?;
Ok(())
}
fn etcd_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
env.base_data_dir.join("etcd.pid")
}

View File

@@ -12,7 +12,6 @@ use std::path::Path;
use std::process::Command;
pub mod compute;
pub mod etcd;
pub mod local_env;
pub mod postgresql_conf;
pub mod safekeeper;
@@ -49,16 +48,3 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
cmd
}
}
fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
for env_key in [
"AWS_ACCESS_KEY_ID",
"AWS_SECRET_ACCESS_KEY",
"AWS_SESSION_TOKEN",
] {
if let Ok(value) = std::env::var(env_key) {
cmd = cmd.env(env_key, value);
}
}
cmd
}

View File

@@ -4,7 +4,6 @@
//! script which will use local paths.
use anyhow::{bail, ensure, Context};
use reqwest::Url;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DisplayFromStr};
use std::collections::HashMap;
@@ -15,15 +14,15 @@ use std::process::{Command, Stdio};
use utils::{
auth::{encode_from_key_file, Claims, Scope},
postgres_backend::AuthType,
zid::{NodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
zid::{ZNodeId, ZTenantId, ZTenantTimelineId, ZTimelineId},
};
use crate::safekeeper::SafekeeperNode;
//
// This data structures represents neon_local CLI config
// This data structures represents zenith CLI config
//
// It is deserialized from the .neon/config file, or the config file passed
// It is deserialized from the .zenith/config file, or the config file passed
// to 'zenith init --config=<path>' option. See control_plane/simple.conf for
// an example.
//
@@ -34,8 +33,8 @@ pub struct LocalEnv {
// compute nodes).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable or
// '.neon' if not given.
// config file itself is. It is read from the ZENITH_REPO_DIR env variable or
// '.zenith' if not given.
#[serde(skip)]
pub base_data_dir: PathBuf,
@@ -60,7 +59,9 @@ pub struct LocalEnv {
#[serde(default)]
pub private_key_path: PathBuf,
pub etcd_broker: EtcdBroker,
// A comma separated broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
#[serde(default)]
pub broker_endpoints: Option<String>,
pub pageserver: PageServerConf,
@@ -76,75 +77,11 @@ pub struct LocalEnv {
branch_name_mappings: HashMap<String, Vec<(ZTenantId, ZTimelineId)>>,
}
/// Etcd broker config for cluster internal communication.
#[serde_as]
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
pub struct EtcdBroker {
/// A prefix to all to any key when pushing/polling etcd from a node.
#[serde(default)]
pub broker_etcd_prefix: Option<String>,
/// Broker (etcd) endpoints for storage nodes coordination, e.g. 'http://127.0.0.1:2379'.
#[serde(default)]
#[serde_as(as = "Vec<DisplayFromStr>")]
pub broker_endpoints: Vec<Url>,
/// Etcd binary path to use.
#[serde(default)]
pub etcd_binary_path: PathBuf,
}
impl EtcdBroker {
pub fn locate_etcd() -> anyhow::Result<PathBuf> {
let which_output = Command::new("which")
.arg("etcd")
.output()
.context("Failed to run 'which etcd' command")?;
let stdout = String::from_utf8_lossy(&which_output.stdout);
ensure!(
which_output.status.success(),
"'which etcd' invocation failed. Status: {}, stdout: {stdout}, stderr: {}",
which_output.status,
String::from_utf8_lossy(&which_output.stderr)
);
let etcd_path = PathBuf::from(stdout.trim());
ensure!(
etcd_path.is_file(),
"'which etcd' invocation was successful, but the path it returned is not a file or does not exist: {}",
etcd_path.display()
);
Ok(etcd_path)
}
pub fn comma_separated_endpoints(&self) -> String {
self.broker_endpoints
.iter()
.map(|url| {
// URL by default adds a '/' path at the end, which is not what etcd CLI wants.
let url_string = url.as_str();
if url_string.ends_with('/') {
&url_string[0..url_string.len() - 1]
} else {
url_string
}
})
.fold(String::new(), |mut comma_separated_urls, url| {
if !comma_separated_urls.is_empty() {
comma_separated_urls.push(',');
}
comma_separated_urls.push_str(url);
comma_separated_urls
})
}
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct PageServerConf {
// node id
pub id: NodeId,
pub id: ZNodeId,
// Pageserver connection settings
pub listen_pg_addr: String,
pub listen_http_addr: String,
@@ -159,7 +96,7 @@ pub struct PageServerConf {
impl Default for PageServerConf {
fn default() -> Self {
Self {
id: NodeId(0),
id: ZNodeId(0),
listen_pg_addr: String::new(),
listen_http_addr: String::new(),
auth_type: AuthType::Trust,
@@ -171,25 +108,19 @@ impl Default for PageServerConf {
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default)]
pub struct SafekeeperConf {
pub id: NodeId,
pub id: ZNodeId,
pub pg_port: u16,
pub http_port: u16,
pub sync: bool,
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
pub auth_enabled: bool,
}
impl Default for SafekeeperConf {
fn default() -> Self {
Self {
id: NodeId(0),
id: ZNodeId(0),
pg_port: 0,
http_port: 0,
sync: true,
remote_storage: None,
backup_threads: None,
auth_enabled: false,
}
}
}
@@ -249,7 +180,12 @@ impl LocalEnv {
if old_timeline_id == &timeline_id {
Ok(())
} else {
bail!("branch '{branch_name}' is already mapped to timeline {old_timeline_id}, cannot map to another timeline {timeline_id}");
bail!(
"branch '{}' is already mapped to timeline {}, cannot map to another timeline {}",
branch_name,
old_timeline_id,
timeline_id
);
}
} else {
existing_values.push((tenant_id, timeline_id));
@@ -285,7 +221,7 @@ impl LocalEnv {
///
/// Unlike 'load_config', this function fills in any defaults that are missing
/// from the config file.
pub fn parse_config(toml: &str) -> anyhow::Result<Self> {
pub fn create_config(toml: &str) -> anyhow::Result<Self> {
let mut env: LocalEnv = toml::from_str(toml)?;
// Find postgres binaries.
@@ -298,11 +234,26 @@ impl LocalEnv {
env.pg_distrib_dir = cwd.join("tmp_install")
}
}
if !env.pg_distrib_dir.join("bin/postgres").exists() {
bail!(
"Can't find postgres binary at {}",
env.pg_distrib_dir.display()
);
}
// Find zenith binaries.
if env.zenith_distrib_dir == Path::new("") {
env.zenith_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
}
for binary in ["pageserver", "safekeeper"] {
if !env.zenith_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{}' in zenith distrib dir '{}'",
binary,
env.zenith_distrib_dir.display()
);
}
}
// If no initial tenant ID was given, generate it.
if env.default_tenant_id.is_none() {
@@ -339,7 +290,7 @@ impl LocalEnv {
pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
// Currently, the user first passes a config file with 'zenith init --config=<path>'
// We read that in, in `create_config`, and fill any missing defaults. Then it's saved
// to .neon/config. TODO: We lose any formatting and comments along the way, which is
// to .zenith/config. TODO: We lose any formatting and comments along the way, which is
// a bit sad.
let mut conf_content = r#"# This file describes a locale deployment of the page server
# and safekeeeper node. It is read by the 'zenith' command-line
@@ -391,26 +342,11 @@ impl LocalEnv {
base_path != Path::new(""),
"repository base path is missing"
);
ensure!(
!base_path.exists(),
"directory '{}' already exists. Perhaps already initialized?",
base_path.display()
);
if !self.pg_distrib_dir.join("bin/postgres").exists() {
bail!(
"Can't find postgres binary at {}",
self.pg_distrib_dir.display()
);
}
for binary in ["pageserver", "safekeeper"] {
if !self.zenith_distrib_dir.join(binary).exists() {
bail!(
"Can't find binary '{binary}' in zenith distrib dir '{}'",
self.zenith_distrib_dir.display()
);
}
}
fs::create_dir(&base_path)?;
@@ -467,36 +403,8 @@ impl LocalEnv {
}
fn base_path() -> PathBuf {
match std::env::var_os("NEON_REPO_DIR") {
Some(val) => PathBuf::from(val),
None => PathBuf::from(".neon"),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn simple_conf_parsing() {
let simple_conf_toml = include_str!("../simple.conf");
let simple_conf_parse_result = LocalEnv::parse_config(simple_conf_toml);
assert!(
simple_conf_parse_result.is_ok(),
"failed to parse simple config {simple_conf_toml}, reason: {simple_conf_parse_result:?}"
);
let string_to_replace = "broker_endpoints = ['http://127.0.0.1:2379']";
let spoiled_url_str = "broker_endpoints = ['!@$XOXO%^&']";
let spoiled_url_toml = simple_conf_toml.replace(string_to_replace, spoiled_url_str);
assert!(
spoiled_url_toml.contains(spoiled_url_str),
"Failed to replace string {string_to_replace} in the toml file {simple_conf_toml}"
);
let spoiled_url_parse_result = LocalEnv::parse_config(&spoiled_url_toml);
assert!(
spoiled_url_parse_result.is_err(),
"expected toml with invalid Url {spoiled_url_toml} to fail the parsing, but got {spoiled_url_parse_result:?}"
);
match std::env::var_os("ZENITH_REPO_DIR") {
Some(val) => PathBuf::from(val.to_str().unwrap()),
None => ".zenith".into(),
}
}

View File

@@ -5,7 +5,7 @@
/// enough to extract a few settings we need in Zenith, assuming you don't do
/// funny stuff like include-directives or funny escaping.
use anyhow::{bail, Context, Result};
use once_cell::sync::Lazy;
use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashMap;
use std::fmt;
@@ -19,7 +19,9 @@ pub struct PostgresConf {
hash: HashMap<String, String>,
}
static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());
lazy_static! {
static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
}
impl PostgresConf {
pub fn new() -> PostgresConf {
@@ -137,10 +139,10 @@ fn escape_str(s: &str) -> String {
//
// This regex is a bit more conservative than the rules in guc-file.l, so we quote some
// strings that PostgreSQL would accept without quoting, but that's OK.
static UNQUOTED_RE: Lazy<Regex> =
Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap());
lazy_static! {
static ref UNQUOTED_RE: Regex =
Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
}
if UNQUOTED_RE.is_match(s) {
s.to_string()
} else {

View File

@@ -18,12 +18,12 @@ use thiserror::Error;
use utils::{
connstring::connection_address,
http::error::HttpErrorBody,
zid::{NodeId, ZTenantId, ZTimelineId},
zid::{ZNodeId, ZTenantId, ZTimelineId},
};
use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::{fill_rust_env_vars, read_pidfile};
#[derive(Error, Debug)]
pub enum SafekeeperHttpError {
@@ -52,7 +52,7 @@ impl ResponseErrorMessageExt for Response {
Err(SafekeeperHttpError::Response(
match self.json::<HttpErrorBody>() {
Ok(err_body) => format!("Error: {}", err_body.msg),
Err(_) => format!("Http error ({}) at {url}.", status.as_u16()),
Err(_) => format!("Http error ({}) at {}.", status.as_u16(), url),
},
))
}
@@ -65,7 +65,7 @@ impl ResponseErrorMessageExt for Response {
//
#[derive(Debug)]
pub struct SafekeeperNode {
pub id: NodeId,
pub id: ZNodeId,
pub conf: SafekeeperConf,
@@ -75,12 +75,16 @@ pub struct SafekeeperNode {
pub http_base_url: String,
pub pageserver: Arc<PageServerNode>,
broker_endpoints: Option<String>,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let pageserver = Arc::new(PageServerNode::from_env(env));
println!("initializing for sk {} for {}", conf.id, conf.http_port);
SafekeeperNode {
id: conf.id,
conf: conf.clone(),
@@ -89,6 +93,7 @@ impl SafekeeperNode {
http_client: Client::new(),
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
pageserver,
broker_endpoints: env.broker_endpoints.clone(),
}
}
@@ -100,7 +105,7 @@ impl SafekeeperNode {
.unwrap()
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: ZNodeId) -> PathBuf {
env.safekeeper_data_dir(format!("sk{}", sk_id).as_ref())
}
@@ -135,27 +140,9 @@ impl SafekeeperNode {
if !self.conf.sync {
cmd.arg("--no-sync");
}
let comma_separated_endpoints = self.env.etcd_broker.comma_separated_endpoints();
if !comma_separated_endpoints.is_empty() {
cmd.args(&["--broker-endpoints", &comma_separated_endpoints]);
if let Some(ref ep) = self.broker_endpoints {
cmd.args(&["--broker-endpoints", ep]);
}
if let Some(prefix) = self.env.etcd_broker.broker_etcd_prefix.as_deref() {
cmd.args(&["--broker-etcd-prefix", prefix]);
}
if let Some(threads) = self.conf.backup_threads {
cmd.args(&["--backup-threads", threads.to_string().as_ref()]);
}
if let Some(ref remote_storage) = self.conf.remote_storage {
cmd.args(&["--remote-storage", remote_storage]);
}
if self.conf.auth_enabled {
cmd.arg("--auth-validation-public-key-path");
// PathBuf is better be passed as is, not via `String`.
cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
}
fill_aws_secrets_vars(&mut cmd);
if !cmd.status()?.success() {
bail!(
@@ -218,13 +205,12 @@ impl SafekeeperNode {
let pid = Pid::from_raw(pid);
let sig = if immediate {
print!("Stopping safekeeper {} immediately..", self.id);
println!("Stop safekeeper immediately");
Signal::SIGQUIT
} else {
print!("Stopping safekeeper {} gracefully..", self.id);
println!("Stop safekeeper gracefully");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
@@ -246,37 +232,25 @@ impl SafekeeperNode {
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if safekeeper flushes a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nSafekeeper connection failed with error: {err}");
}
}
}
if tcp_stopped {
// Also check status on the HTTP port
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Safekeeper stopped receiving connections");
//Now check status
match self.check_status() {
Err(SafekeeperHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
Ok(_) => {
println!("Safekeeper status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
eprintln!("\nSafekeeper status check failed with error: {err}");
println!("Safekeeper status is: {}", err);
return Ok(());
}
Ok(()) => {
// keep waiting
}
}
} else {
println!("Safekeeper still receives connections");
thread::sleep(Duration::from_secs(1));
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop safekeeper with pid {}", pid);
@@ -301,14 +275,15 @@ impl SafekeeperNode {
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
peer_ids: Vec<NodeId>,
peer_ids: Vec<ZNodeId>,
) -> Result<()> {
Ok(self
.http_request(
Method::POST,
format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
format!("{}/{}", self.http_base_url, "timeline"),
)
.json(&TimelineCreateRequest {
tenant_id,
timeline_id,
peer_ids,
})

View File

@@ -1,8 +1,6 @@
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufReader, Write};
use std::io::Write;
use std::net::TcpStream;
use std::num::NonZeroU64;
use std::path::PathBuf;
use std::process::Command;
use std::time::Duration;
@@ -12,9 +10,8 @@ use anyhow::{bail, Context};
use nix::errno::Errno;
use nix::sys::signal::{kill, Signal};
use nix::unistd::Pid;
use pageserver::http::models::{
TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
};
use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
use pageserver::timelines::TimelineInfo;
use postgres::{Config, NoTls};
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
@@ -28,7 +25,8 @@ use utils::{
};
use crate::local_env::LocalEnv;
use crate::{fill_aws_secrets_vars, fill_rust_env_vars, read_pidfile};
use crate::{fill_rust_env_vars, read_pidfile};
use pageserver::tenant_mgr::TenantInfo;
#[derive(Error, Debug)]
pub enum PageserverHttpError {
@@ -39,12 +37,6 @@ pub enum PageserverHttpError {
Response(String),
}
impl From<anyhow::Error> for PageserverHttpError {
fn from(e: anyhow::Error) -> Self {
Self::Response(e.to_string())
}
}
type Result<T> = result::Result<T, PageserverHttpError>;
pub trait ResponseErrorMessageExt: Sized {
@@ -129,16 +121,6 @@ impl PageServerNode {
);
let listen_pg_addr_param =
format!("listen_pg_addr='{}'", self.env.pageserver.listen_pg_addr);
let broker_endpoints_param = format!(
"broker_endpoints=[{}]",
self.env
.etcd_broker
.broker_endpoints
.iter()
.map(|url| format!("'{url}'"))
.collect::<Vec<_>>()
.join(",")
);
let mut args = Vec::with_capacity(20);
args.push("--init");
@@ -147,19 +129,8 @@ impl PageServerNode {
args.extend(["-c", &authg_type_param]);
args.extend(["-c", &listen_http_addr_param]);
args.extend(["-c", &listen_pg_addr_param]);
args.extend(["-c", &broker_endpoints_param]);
args.extend(["-c", &id]);
let broker_etcd_prefix_param = self
.env
.etcd_broker
.broker_etcd_prefix
.as_ref()
.map(|prefix| format!("broker_etcd_prefix='{prefix}'"));
if let Some(broker_etcd_prefix_param) = broker_etcd_prefix_param.as_deref() {
args.extend(["-c", broker_etcd_prefix_param]);
}
for config_override in config_overrides {
args.extend(["-c", config_override]);
}
@@ -196,9 +167,6 @@ impl PageServerNode {
);
}
// echo the captured output of the init command
println!("{}", String::from_utf8_lossy(&init_output.stdout));
Ok(initial_timeline_id)
}
@@ -218,6 +186,8 @@ impl PageServerNode {
);
io::stdout().flush().unwrap();
let mut cmd = Command::new(self.env.pageserver_bin()?);
let repo_path = self.repo_path();
let mut args = vec!["-D", repo_path.to_str().unwrap()];
@@ -225,11 +195,9 @@ impl PageServerNode {
args.extend(["-c", config_override]);
}
let mut cmd = Command::new(self.env.pageserver_bin()?);
let mut filled_cmd = fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
filled_cmd = fill_aws_secrets_vars(filled_cmd);
fill_rust_env_vars(cmd.args(&args).arg("--daemonize"));
if !filled_cmd.status()?.success() {
if !cmd.status()?.success() {
bail!(
"Pageserver failed to start. See '{}' for details.",
self.repo_path().join("pageserver.log").display()
@@ -289,13 +257,12 @@ impl PageServerNode {
let pid = Pid::from_raw(read_pidfile(&pid_file)?);
let sig = if immediate {
print!("Stopping pageserver immediately..");
println!("Stop pageserver immediately");
Signal::SIGQUIT
} else {
print!("Stopping pageserver gracefully..");
println!("Stop pageserver gracefully");
Signal::SIGTERM
};
io::stdout().flush().unwrap();
match kill(pid, sig) {
Ok(_) => (),
Err(Errno::ESRCH) => {
@@ -317,38 +284,25 @@ impl PageServerNode {
// TODO Remove this "timeout" and handle it on caller side instead.
// Shutting down may take a long time,
// if pageserver checkpoints a lot of data
let mut tcp_stopped = false;
for i in 0..600 {
if !tcp_stopped {
if let Err(err) = TcpStream::connect(&address) {
tcp_stopped = true;
if err.kind() != io::ErrorKind::ConnectionRefused {
eprintln!("\nPageserver connection failed with error: {err}");
}
}
}
if tcp_stopped {
// Also check status on the HTTP port
for _ in 0..100 {
if let Err(_e) = TcpStream::connect(&address) {
println!("Pageserver stopped receiving connections");
//Now check status
match self.check_status() {
Err(PageserverHttpError::Transport(err)) if err.is_connect() => {
println!("done!");
return Ok(());
Ok(_) => {
println!("Pageserver status is OK. Wait a bit.");
thread::sleep(Duration::from_secs(1));
}
Err(err) => {
eprintln!("\nPageserver status check failed with error: {err}");
println!("Pageserver status is: {}", err);
return Ok(());
}
Ok(()) => {
// keep waiting
}
}
} else {
println!("Pageserver still receives connections");
thread::sleep(Duration::from_secs(1));
}
if i % 10 == 0 {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(Duration::from_millis(100));
}
bail!("Failed to stop pageserver with pid {}", pid);
@@ -401,7 +355,6 @@ impl PageServerNode {
.get("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()?,
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.get("compaction_target_size")
.map(|x| x.parse::<u64>())
@@ -421,15 +374,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()?,
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.get("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
max_lsn_wal_lag: settings
.get("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
})
.send()?
.error_from_body()?
@@ -453,42 +397,22 @@ impl PageServerNode {
tenant_id,
checkpoint_distance: settings
.get("checkpoint_distance")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
.map(|x| x.parse::<u64>().unwrap()),
compaction_target_size: settings
.get("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
.map(|x| x.parse::<u64>().unwrap()),
compaction_period: settings.get("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.get("compaction_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'compaction_threshold' as an integer")?,
.map(|x| x.parse::<usize>().unwrap()),
gc_horizon: settings
.get("gc_horizon")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
.map(|x| x.parse::<u64>().unwrap()),
gc_period: settings.get("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.get("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
.map(|x| x.parse::<usize>().unwrap()),
pitr_interval: settings.get("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.get("walreceiver_connect_timeout")
.map(|x| x.to_string()),
lagging_wal_timeout: settings.get("lagging_wal_timeout").map(|x| x.to_string()),
max_lsn_wal_lag: settings
.get("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
})
.send()?
.error_from_body()?;
@@ -532,54 +456,4 @@ impl PageServerNode {
Ok(timeline_info_response)
}
/// Import a basebackup prepared using either:
/// a) `pg_basebackup -F tar`, or
/// b) The `fullbackup` pageserver endpoint
///
/// # Arguments
/// * `tenant_id` - tenant to import into. Created if not exists
/// * `timeline_id` - id to assign to imported timeline
/// * `base` - (start lsn of basebackup, path to `base.tar` file)
/// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
pub fn timeline_import(
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
base: (Lsn, PathBuf),
pg_wal: Option<(Lsn, PathBuf)>,
) -> anyhow::Result<()> {
let mut client = self.pg_connection_config.connect(NoTls).unwrap();
// Init base reader
let (start_lsn, base_tarfile_path) = base;
let base_tarfile = File::open(base_tarfile_path)?;
let mut base_reader = BufReader::new(base_tarfile);
// Init wal reader if necessary
let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
let wal_tarfile = File::open(wal_tarfile_path)?;
let wal_reader = BufReader::new(wal_tarfile);
(end_lsn, Some(wal_reader))
} else {
(start_lsn, None)
};
// Import base
let import_cmd =
format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut base_reader, &mut writer)?;
writer.finish()?;
// Import wal if necessary
if let Some(mut wal_reader) = wal_reader {
let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
let mut writer = client.copy_in(&import_cmd)?;
io::copy(&mut wal_reader, &mut writer)?;
writer.finish()?;
}
Ok(())
}
}

View File

@@ -1,24 +1,13 @@
#!/bin/sh
set -eux
pageserver_id_param="${NODE_ID:-10}"
broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
if [ "$broker_endpoints_param" != "absent" ]; then
broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
else
broker_endpoints_param=''
fi
remote_storage_param="${REMOTE_STORAGE:-}"
if [ "$1" = 'pageserver' ]; then
if [ ! -d "/data/tenants" ]; then
echo "Initializing pageserver data directory"
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10"
fi
echo "Staring pageserver at 0.0.0.0:6400"
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -D /data
else
"$@"
fi

1
docs/.gitignore vendored
View File

@@ -1 +0,0 @@
book

14
docs/README.md Normal file
View File

@@ -0,0 +1,14 @@
# Zenith documentation
## Table of contents
- [authentication.md](authentication.md) — pageserver JWT authentication.
- [docker.md](docker.md) — Docker images and building pipeline.
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [sourcetree.md](sourcetree.md) — Overview of the source tree layeout.
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core

View File

@@ -1,82 +0,0 @@
# Summary
[Introduction]()
- [Separation of Compute and Storage](./separation-compute-storage.md)
# Architecture
- [Compute]()
- [WAL proposer]()
- [WAL Backpressure]()
- [Postgres changes](./core_changes.md)
- [Pageserver](./pageserver.md)
- [Services](./pageserver-services.md)
- [Thread management](./pageserver-thread-mgmt.md)
- [WAL Redo](./pageserver-walredo.md)
- [Page cache](./pageserver-pagecache.md)
- [Storage](./pageserver-storage.md)
- [Datadir mapping]()
- [Layer files]()
- [Branching]()
- [Garbage collection]()
- [Cloud Storage]()
- [Processing a GetPage request](./pageserver-processing-getpage.md)
- [Processing WAL](./pageserver-processing-wal.md)
- [Management API]()
- [Tenant Rebalancing]()
- [WAL Service](walservice.md)
- [Consensus protocol](safekeeper-protocol.md)
- [Management API]()
- [Rebalancing]()
- [Control Plane]()
- [Proxy]()
- [Source view](./sourcetree.md)
- [docker.md](./docker.md) — Docker images and building pipeline.
- [Error handling and logging]()
- [Testing]()
- [Unit testing]()
- [Integration testing]()
- [Benchmarks]()
- [Glossary](./glossary.md)
# Uncategorized
- [authentication.md](./authentication.md)
- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [settings.md](./settings.md)
#FIXME: move these under sourcetree.md
#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
#- [test_runner/README.md](/test_runner/README.md)
# RFCs
- [RFCs](./rfcs/README.md)
- [002-storage](rfcs/002-storage.md)
- [003-laptop-cli](rfcs/003-laptop-cli.md)
- [004-durability](rfcs/004-durability.md)
- [005-zenith_local](rfcs/005-zenith_local.md)
- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
- [008-push-pull](rfcs/008-push-pull.md)
- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
- [010-storage_details](rfcs/010-storage_details.md)
- [011-retention-policy](rfcs/011-retention-policy.md)
- [012-background-tasks](rfcs/012-background-tasks.md)
- [013-term-history](rfcs/013-term-history.md)
- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
- [014-storage-lsm](rfcs/014-storage-lsm.md)
- [015-storage-messaging](rfcs/015-storage-messaging.md)
- [016-connection-routing](rfcs/016-connection-routing.md)
- [cluster-size-limits](rfcs/cluster-size-limits.md)

View File

@@ -1,5 +0,0 @@
[book]
language = "en"
multilingual = false
src = "."
title = "Neon architecture"

View File

@@ -1,519 +1,202 @@
# Postgres core changes
1. Add t_cid to XLOG record
- Why?
The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
This lists all the changes that have been made to the PostgreSQL
source tree, as a somewhat logical set of patches. The long-term goal
is to eliminate all these changes, by submitting patches to upstream
and refactoring code into extensions, so that you can run unmodified
PostgreSQL against Neon storage.
To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares abut it anymore.
In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the
page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in
the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for
the WAL redo process.
- Alternatives?
I don't know
In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the
smgr interface. Once all the core changes have been submitted to upstream or eliminated some other
way, the extension could live outside the postgres repository and build against vanilla PostgreSQL.
2. Add PD_WAL_LOGGED.
- Why?
Postgres sometimes writes data to the page before it is wal-logged. If such page ais swapped out, we will loose this change. The problem is currently solved by setting PD_WAL_LOGGED bit in page header. When page without this bit set is written to the SMGR, then it is forced to be written to the WAL as FPI using log_newpage_copy() function.
Below is a list of all the PostgreSQL source code changes, categorized into changes needed for
compute, and changes needed for the WAL redo process:
There was wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY,VACUUM and when record hint bits are set.
# Changes for Compute node
- Discussion:
https://discord.com/channels/869525774699462656/882681420986851359
## Add t_cid to heap WAL records
- Alternatives:
Do not store this flag in page header, but associate this bit with shared buffer. Logically it is more correct but in practice we will get not advantages: neither in space, neither in CPU overhead.
```
src/backend/access/heap/heapam.c | 26 +-
src/include/access/heapam_xlog.h | 6 +-
```
We have added a new t_cid field to heap WAL records. This changes the WAL record format, making Neon WAL format incompatible with vanilla PostgreSQL!
3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo.
- Why?
XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres.
### Problem we're trying to solve
- Alternatives?
No
The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running.
### How to get rid of the patch
4. Eliminate reporting of some warnings related with hint bits, for example
"page is not marked all-visible but visibility map bit is set in relation".
- Why?
Hint bit may be not WAL logged.
Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information.
- Alternative?
Always wal log any page changes.
### Alternatives
Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched a transaction that's still running. However, that seems very complicated.
5. Maintain last written LSN.
- Why?
When compute node requests page from page server, we need to specify LSN. Ideally it should be LSN
of WAL record performing last update of this pages. But we do not know it, because we do not have page.
We can use current WAL flush position, but in this case there is high probability that page server
will be blocked until this peace of WAL is delivered.
As better approximation we can keep max LSN of written page. It will be better to take in account LSNs only of evicted pages,
but SMGR API doesn't provide such knowledge.
## ginfast.c
- Alternatives?
Maintain map of LSNs of evicted pages.
```
diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
index e0d9940946..2d964c02e9 100644
--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
memset(&sublist, 0, sizeof(GinMetaPageData));
makeSublist(index, collector->tuples, collector->ntuples, &sublist);
+ if (metadata->head != InvalidBlockNumber)
+ {
+ /*
+ * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call
+ * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from
+ * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write()
+ * will try to WAL-log an image of the page.
+ */
+ buffer = ReadBuffer(index, metadata->tail);
+ }
+
if (needWal)
XLogBeginInsert();
@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
data.prevTail = metadata->tail;
data.newRightlink = sublist.head;
- buffer = ReadBuffer(index, metadata->tail);
LockBuffer(buffer, GIN_EXCLUSIVE);
page = BufferGetPage(buffer);
```
The problem is explained in the comment above
6. Launching Postgres without WAL.
- Why?
According to Zenith architecture compute node is stateless. So when we are launching
compute node, we need to provide some dummy PG_DATADIR. Relation pages
can be requested on demand from page server. But Postgres still need some non-relational data:
control and configuration files, SLRUs,...
It is currently implemented using basebackup (do not mix with pg_basebackup) which is created
by pageserver. It includes in this tarball config/control files, SLRUs and required directories.
As far as pageserver do not have original (non-scattered) WAL segments, it includes in
this tarball dummy WAL segment which contains only SHUTDOWN_CHECKPOINT record at the beginning of segment,
which redo field points to the end of wal. It allows to load checkpoint record in more or less
standard way with minimal changes of Postgres, but then some special handling is needed,
including restoring previous record position from zenith.signal file.
Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo)
to pass checks performed by XLogReader.
### How to get rid of the patch
- Alternatives?
We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record
in special way. But it may only increase number of changes in xlog.c
Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical
section or something.
7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
- Why?
We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver.
So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE
which means that recovery for them is not needed.
Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images?
- Alternatives?
No
8. Enforce WAL logging of sequence updates.
- Why?
Due to performance reasons Postgres don't want to log each fetching of a value from a sequence,
so we pre-log a few fetches in advance. In the event of crash we can lose
(skip over) as many values as we pre-logged.
But it doesn't work with Zenith because page with sequence value can be evicted from buffer cache
and we will get a gap in sequence values even without crash.
## Mark index builds that use buffer manager without logging explicitly
- Alternatives:
Do not try to preserve sequential order but avoid performance penalty.
```
src/backend/access/gin/gininsert.c | 7 +
src/backend/access/gist/gistbuild.c | 15 +-
src/backend/access/spgist/spginsert.c | 8 +-
also some changes in src/backend/storage/smgr/smgr.c
```
9. Treat unlogged tables as normal (permanent) tables.
- Why?
Unlogged tables are not transient, so them have to survive node restart (unlike temporary tables).
But as far as compute node is stateless, we need to persist their data to storage node.
And it can only be done through the WAL.
When a GIN index is built, for example, it is built by inserting the entries into the index more or
less normally, but without WAL-logging anything. After the index has been built, we iterate through
all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged
and is evicted from the buffer cache, it is lost. We have an check to catch that in the Neon
extension. To fix that, we've added a few functions to track explicitly when we're performing such
an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and
`smgr_end_unlogged_build`.
### How to get rid of the patch
I think it would make sense to be more explicit about that in PostgreSQL too. So extract these
changes to a patch and post to pgsql-hackers.
- Alternatives?
* Store unlogged tables locally (violates requirement of stateless compute nodes).
* Prohibit unlogged tables at all.
## Track last-written page LSN
10. Support start Postgres in wal-redo mode
- Why?
To be able to apply WAL record and reconstruct pages at page server.
```
src/backend/commands/dbcommands.c | 17 +-
- Alternatives?
* Rewrite redo handlers in Rust
* Do not reconstruct pages at page server at all and do it at compute node.
Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too
```
Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same
LSN in the GetPage@LSN request when reading the page back from the page server. The value is
conservative: it would be correct to always use the last-inserted LSN, but it would be slow because
then the page server would need to wait for the recent WAL to be streamed and processed, before
responding to any GetPage@LSN request.
11. WAL proposer
- Why?
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
It is currently implemented as patch to standard WAL sender.
The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes,
but there are a few exceptions where we've had to add explicit calls to the Neon-specific
SetLastWrittenPageLSN() function.
- Alternatives?
Can be moved to extension if some extra callbacks will be added to wal sender code.
There's an open PR to track the LSN in a more-fine grained fashion:
https://github.com/neondatabase/postgres/pull/177
PostgreSQL v15 introduces a new method to do CREATE DATABASE that WAL-logs the database instead of
relying copying files and checkpoint. With that method, we probably won't need any special handling.
The old method is still available, though.
12. Secure Computing BPF API wrapper.
- Why?
Pageserver delegates complex WAL decoding duties to Postgres,
which means that the latter might fall victim to carefully designed
malicious WAL records and start doing harmful things to the system.
To prevent this, it has been decided to limit possible interactions
with the outside world using the Secure Computing BPF mode.
- Alternatives:
* Rewrite redo handlers in Rust.
* Add more checks to guarantee correctness of WAL records.
* Move seccomp.c to extension
* Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
13. Callbacks for replica feedbacks
- Why?
Allowing waproposer to interact with walsender code.
- Alternatives
Copy walsender code to walproposer.
14. Support multiple SMGR implementations.
- Why?
Postgres provides abstract API for storage manager but it has only one implementation
and provides no way to replace it with custom storage manager.
- Alternatives?
None.
15. Calculate database size as sum of all database relations.
- Why?
Postgres is calculating database size by traversing data directory
but as far as Zenith compute node is stateless we can not do it.
- Alternatives?
Send this request directly to pageserver and calculate real (physical) size
of Zenith representation of database/timeline, rather than sum logical size of all relations.
### How to get rid of the patch
Wait until v15?
-----------------------------------------------
Not currently committed but proposed:
1. Disable ring buffer buffer manager strategies
- Why?
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
Even if there are free space in buffer cache, pages may be evicted.
Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
cost of requesting page from page server is much higher.
## Cache relation sizes
- Alternatives?
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
for example copy evicted page from ring buffer to some other buffer if there is free space
in buffer cache.
The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going
to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the
relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for
Neon)
2. Disable marking page as dirty when hint bits are set.
- Why?
Postgres has to modify page twice: first time when some tuple is updated and second time when
hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL.
- Alternatives?
Add special WAL record for setting page hints.
## Misc change in vacuumlazy.c
3. Prefetching
- Why?
As far as pages in Zenith are loaded on demand, to reduce node startup time
and also sppedup some massive queries we need some mechanism for bulk loading to
reduce page request round-trip overhead.
```
index 8aab6e324e..c684c4fbee 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
else if (all_visible_according_to_vm && !PageIsAllVisible(page)
&& VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
{
- elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
+ /* ZENITH-XXX: all visible hint is not wal-logged
+ * FIXME: Replay visibilitymap changes in pageserver
+ */
+ elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
vacrel->relname, blkno);
visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
VISIBILITYMAP_VALID_BITS);
```
Currently Postgres is supporting prefetching only for bitmap scan.
In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
Is this still needed? If that WARNING happens, it looks like potential corruption that we should
fix!
## Use buffer manager when extending VM or FSM
```
src/backend/storage/freespace/freespace.c | 14 +-
src/backend/access/heap/visibilitymap.c | 15 +-
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e198df65d8..addfe93eac 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
/* Now extend the file */
while (vm_nblocks_now < vm_nblocks)
{
- PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
+ /*
+ * ZENITH: Initialize VM pages through buffer cache to prevent loading
+ * them from pageserver.
+ */
+ Buffer buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
+ RBM_ZERO_AND_LOCK, NULL);
+ Page page = BufferGetPage(buffer);
+
+ PageInit((Page) page, BLCKSZ, 0);
+ PageSetChecksumInplace(page, vm_nblocks_now);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
- smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
- pg.data, false);
vm_nblocks_now++;
}
```
### Problem we're trying to solve
???
### How to get rid of the patch
Maybe this would be a reasonable change in PostgreSQL too?
## Allow startup without reading checkpoint record
In Neon, the compute node is stateless. So when we are launching compute node, we need to provide
some dummy PG_DATADIR. Relation pages can be requested on demand from page server. But Postgres
still need some non-relational data: control and configuration files, SLRUs,... It is currently
implemented using basebackup (do not mix with pg_basebackup) which is created by pageserver. It
includes in this tarball config/control files, SLRUs and required directories.
As pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL
segment to bootstrap the WAL writing, but it doesn't contain the checkpoint record. There are some
changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
from WAL.
This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
### How to get rid of the patch
???
### Alternatives
Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm
afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL.
## Disable sequence caching
```
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 0415df9ccb..9f9db3c8bc 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -53,7 +53,9 @@
* so we pre-log a few fetches in advance. In the event of
* crash we can lose (skip over) as many values as we pre-logged.
*/
-#define SEQ_LOG_VALS 32
+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
+/* #define SEQ_LOG_VALS 32 */
+#define SEQ_LOG_VALS 0
```
Due to performance reasons Postgres don't want to log each fetching of a value from a sequence, so
it pre-logs a few fetches in advance. In the event of crash we can lose (skip over) as many values
as we pre-logged. But with Neon, because page with sequence value can be evicted from buffer cache,
we can get a gap in sequence values even without crash.
### How to get rid of the patch
Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence
relations in the Neon extension, to WAL log the sequence page when it's about to be evicted. It
would be weird if the sequence moved backwards though, think of PITR.
Or add a GUC for the amount to prefix to PostgreSQL, and force it to 1 in Neon.
## Walproposer
```
src/Makefile | 1 +
src/backend/replication/libpqwalproposer/Makefile | 37 +
src/backend/replication/libpqwalproposer/libpqwalproposer.c | 416 ++++++++++++
src/backend/postmaster/bgworker.c | 4 +
src/backend/postmaster/postmaster.c | 6 +
src/backend/replication/Makefile | 4 +-
src/backend/replication/walproposer.c | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/backend/replication/walproposer_utils.c | 402 +++++++++++
src/backend/replication/walreceiver.c | 7 +
src/backend/replication/walsender.c | 320 ++++++---
src/backend/storage/ipc/ipci.c | 6 +
src/include/replication/walproposer.h | 565 ++++++++++++++++
```
WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes. It is
currently implemented as patch to standard WAL sender.
### How to get rid of the patch
Refactor into an extension. Submit hooks or APIs into upstream if necessary.
@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96
## Ignore unexpected data beyond EOF in bufmgr.c
```
@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
*/
bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
if (!PageIsNew((Page) bufBlock))
- ereport(ERROR,
+ {
+ // XXX-ZENITH
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ ereport(DEBUG1,
(errmsg("unexpected data beyond EOF in block %u of relation %s",
blockNum, relpath(smgr->smgr_rnode, forkNum)),
errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
-
+ }
/*
* We *must* do smgrextend before succeeding, else the page will not
* be reserved by the kernel, and the next P_NEW call will decide to
```
PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros
first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend
a relation at the same time, the pages can be WAL-logged in different order.
I'm not sure what scenario exactly required this change in Neon, though.
### How to get rid of the patch
Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation,
and finally WAL-log that the extension succeeded.
## Make smgr interface available to extensions
```
src/backend/storage/smgr/smgr.c | 203 +++---
src/include/storage/smgr.h | 72 +-
```
### How to get rid of the patch
Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.
## Added relpersistence argument to smgropen()
```
src/backend/access/heap/heapam_handler.c | 2 +-
src/backend/catalog/storage.c | 10 +-
src/backend/commands/tablecmds.c | 2 +-
src/backend/storage/smgr/md.c | 4 +-
src/include/utils/rel.h | 3 +-
```
Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc.
implementations need to know the 'relpersistence' of the relation. To get that information where
it's needed, we added the 'relpersistence' field to smgropen().
### How to get rid of the patch
Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the
benefit of extensions like Neon. Should consider this in the patch to make smgr API usable to
extensions.
## Alternatives
Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on
compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially
ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the
relpersistence argument to handle index builds? See item on "Mark index builds that use buffer
manager without logging explicitly".
## Use smgr and dbsize_hook for size calculations
```
src/backend/utils/adt/dbsize.c | 61 +-
```
In PostgreSQL, the rel and db-size functions scan the data directory directly. That won't work in Neon.
### How to get rid of the patch
Send patch to PostgreSQL, to use smgr API functions for relation size calculation instead. Maybe as
part of the general smgr API patch.
# WAL redo process changes
Pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall
victim to carefully designed malicious WAL records and start doing harmful things to the system. To
prevent this, the redo functions are executed in a separate process that is sandboxed with Linux
Secure Computing mode (see seccomp(2) man page).
As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust
This is infeasible. However, it would take a lot of effort to rewrite them, ensure that you've done
the rewrite correctly, and once you've done that, it would be a lot of ongoing maintenance effort to
keep the rewritten code in sync over time, across new PostgreSQL versions. That's why we want to
leverage PostgreSQL code.
Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be
safe to call them directly from Rust code, without needing the security sandbox. That's not feasible
for similar reasons as rewriting them in Rust.
## Don't replay change in XLogReadBufferForRedo that are not for the target page we're replaying
```
src/backend/access/gin/ginxlog.c | 19 +-
Also some changes in xlog.c and xlogutils.c
Example:
@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record)
if (!isLeaf)
ginRedoClearIncompleteSplit(record, 3);
- if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
+ action = XLogReadBufferForRedo(record, 0, &lbuffer);
+ if (action != BLK_RESTORED && action != BLK_DONE)
elog(ERROR, "GIN split record did not contain a full-page image of left page");
```
### Problem we're trying to solve
In PostgreSQL, if a WAL redo function calls XLogReadBufferForRead() for a page that has a full-page
image, it always succeeds. However, Neon WAL redo process is only concerned about replaying changes
to a singe page, so replaying any changes for other pages is a waste of cycles. We have modified
XLogReadBufferForRead() to return BLK_DONE for all other pages, to avoid the overhead. That is
unexpected by code like the above.
### How to get rid of the patch
Submit the changes to upstream, hope the community accepts them. There's no harm to PostgreSQL from
these changes, although it doesn't have any benefit either.
To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead the
WAL, and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes
to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or
BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream.
### Alternatives
Maybe we could revert this optimization, and restore pages other than the target page too.
## Add predefined_sysidentifier flag to initdb
```
src/backend/bootstrap/bootstrap.c | 13 +-
src/bin/initdb/initdb.c | 4 +
And some changes in xlog.c
```
This is used to help with restoring a database when you have all the WAL, all the way back to
initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same
sysidentifier.
### How to get rid of the patch
Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres
patches, we can just keep it around as a patch or as separate branch in a repo.
# Not currently committed but proposed
## Disable ring buffer buffer manager strategies
### Why?
Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
Even if there are free space in buffer cache, pages may be evicted.
Negative effect of it can be somehow compensated by file system cache, but in Neon,
cost of requesting page from page server is much higher.
### Alternatives?
Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
for example copy evicted page from ring buffer to some other buffer if there is free space
in buffer cache.
## Disable marking page as dirty when hint bits are set.
### Why?
Postgres has to modify page twice: first time when some tuple is updated and second time when
hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL.
### Alternatives?
Add special WAL record for setting page hints.
## Prefetching
### Why?
As far as pages in Neon are loaded on demand, to reduce node startup time
and also speedup some massive queries we need some mechanism for bulk loading to
reduce page request round-trip overhead.
Currently Postgres is supporting prefetching only for bitmap scan.
In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us.
For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages
of heap relation addressed by TIDs.
## Prewarming
### Why?
Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
We can capture state of compute node buffer cache and send bulk request for this pages at startup.
4. Prewarming.
- Why?
Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
We can capture state of compute node buffer cache and send bulk request for this pages at startup.

View File

@@ -1,20 +1,20 @@
# Docker images of Neon
# Docker images of Zenith
## Images
Currently we build two main images:
- [neondatabase/neon](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [neondatabase/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres).
- [zenithdb/zenith](https://hub.docker.com/repository/docker/zenithdb/zenith) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile).
- [zenithdb/compute-node](https://hub.docker.com/repository/docker/zenithdb/compute-node) — compute node image with pre-built Postgres binaries from [zenithdb/postgres](https://github.com/zenithdb/postgres).
And additional intermediate image:
And additional intermediate images:
- [neondatabase/compute-tools](https://hub.docker.com/repository/docker/neondatabase/compute-tools) — compute node configuration management tools.
- [zenithdb/compute-tools](https://hub.docker.com/repository/docker/zenithdb/compute-tools) — compute node configuration management tools.
## Building pipeline
We build all images after a successful `release` tests run and push automatically to Docker Hub with two parallel CI jobs
1. Image `zenithdb/compute-tools` is re-built automatically.
1. `neondatabase/compute-tools` and `neondatabase/compute-node`
2. Image `zenithdb/compute-node` is built independently in the [zenithdb/postgres](https://github.com/zenithdb/postgres) repo.
2. `neondatabase/neon`
3. Image `zenithdb/zenith` is built in this repo after a successful `release` tests run and pushed to Docker Hub automatically.

View File

@@ -2,7 +2,7 @@
### Authentication
### Backpressure
### Backpresssure
Backpressure is used to limit the lag between pageserver and compute node or WAL service.
@@ -21,7 +21,7 @@ NOTE:It has nothing to do with PostgreSQL pg_basebackup.
### Branch
We can create branch at certain LSN using `neon_local timeline branch` command.
We can create branch at certain LSN using `zenith timeline branch` command.
Each Branch lives in a corresponding timeline[] and has an ancestor[].
@@ -75,7 +75,7 @@ layer's Segment and range of LSNs.
There are two kinds of layers, in-memory and on-disk layers. In-memory
layers are used to ingest incoming WAL, and provide fast access
to the recent page versions. On-disk layers are stored as files on disk, and
are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more.
are immutable. See pageserver/src/layered_repository/README.md for more.
### Layer file (on-disk layer)
@@ -91,7 +91,7 @@ The layer map tracks what layers exist in a timeline.
### Layered repository
Neon repository implementation that keeps data in layers.
Zenith repository implementation that keeps data in layers.
### LSN
The Log Sequence Number (LSN) is a unique identifier of the WAL record[] in the WAL log.
@@ -101,7 +101,7 @@ It is printed as two hexadecimal numbers of up to 8 digits each, separated by a
Check also [PostgreSQL doc about pg_lsn type](https://www.postgresql.org/docs/devel/datatype-pg-lsn.html)
Values can be compared to calculate the volume of WAL data that separates them, so they are used to measure the progress of replication and recovery.
In Postgres and Neon LSNs are used to describe certain points in WAL handling.
In postgres and Zenith lsns are used to describe certain points in WAL handling.
PostgreSQL LSNs and functions to monitor them:
* `pg_current_wal_insert_lsn()` - Returns the current write-ahead log insert location.
@@ -111,13 +111,13 @@ PostgreSQL LSNs and functions to monitor them:
* `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
[source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):
Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information.
Zenith safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
* `CommitLSN`: position in WAL confirmed by quorum safekeepers.
* `RestartLSN`: position in WAL confirmed by all safekeepers.
* `FlushLSN`: part of WAL persisted to the disk by safekeeper.
* `VCL`: the largest LSN for which we can guarantee availability of all prior records.
* `VCL`: the largerst LSN for which we can guarantee availablity of all prior records.
Neon pageserver LSNs:
Zenith pageserver LSNs:
* `last_record_lsn` - the end of last processed WAL record.
* `disk_consistent_lsn` - data is known to be fully flushed and fsync'd to local disk on pageserver up to this LSN.
* `remote_consistent_lsn` - The last LSN that is synced to remote storage and is guaranteed to survive pageserver crash.
@@ -132,7 +132,7 @@ This is the unit of data exchange between compute node and pageserver.
### Pageserver
Neon storage engine: repositories + wal receiver + page service + wal redo.
Zenith storage engine: repositories + wal receiver + page service + wal redo.
### Page service
@@ -184,10 +184,10 @@ relation exceeds that size, it is split into multiple segments.
SLRUs include pg_clog, pg_multixact/members, and
pg_multixact/offsets. There are other SLRUs in PostgreSQL, but
they don't need to be stored permanently (e.g. pg_subtrans),
or we do not support them in neon yet (pg_commit_ts).
or we do not support them in zenith yet (pg_commit_ts).
### Tenant (Multitenancy)
Tenant represents a single customer, interacting with Neon.
Tenant represents a single customer, interacting with Zenith.
Wal redo[] activity, timelines[], layers[] are managed for each tenant independently.
One pageserver[] can serve multiple tenants at once.
One safekeeper

View File

@@ -6,7 +6,7 @@ Zenith supports multitenancy. One pageserver can serve multiple tenants at once.
### Tenants in other commands
By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct argument `--tenantid=<tenantid>` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants.
By default during `zenith init` new tenant is created on the pageserver. Newly created tenant's id is saved to cli config, so other commands can use it automatically if no direct arugment `--tenantid=<tenantid>` is provided. So generally tenantid more frequently appears in internal pageserver interface. Its commands take tenantid argument to distinguish to which tenant operation should be applied. CLI support creation of new tenants.
Examples for cli:

View File

@@ -1,9 +0,0 @@
# Page Service
The Page Service listens for GetPage@LSN requests from the Compute Nodes,
and responds with pages from the repository. On each GetPage@LSN request,
it calls into the Repository function
A separate thread is spawned for each incoming connection to the page
service. The page service uses the libpq protocol to communicate with
the client. The client is a Compute Postgres instance.

View File

@@ -1,8 +0,0 @@
# Page cache
TODO:
- shared across tenants
- store pages from layer files
- store pages from "in-memory layer"
- store materialized pages

View File

@@ -1,4 +0,0 @@
# Processing a GetPage request
TODO:
- sequence diagram that shows how a GetPage@LSN request is processed

View File

@@ -1,5 +0,0 @@
# Processing WAL
TODO:
- diagram that shows how incoming WAL is processed
- explain durability, what is fsync'd when, disk_consistent_lsn

View File

@@ -1,26 +0,0 @@
## Thread management
Each thread in the system is tracked by the `thread_mgr` module. It
maintains a registry of threads, and which tenant or timeline they are
operating on. This is used for safe shutdown of a tenant, or the whole
system.
### Handling shutdown
When a tenant or timeline is deleted, we need to shut down all threads
operating on it, before deleting the data on disk. A thread registered
in the thread registry can check if it has been requested to shut down,
by calling `is_shutdown_requested()`. For async operations, there's also
a `shudown_watcher()` async task that can be used to wake up on shutdown.
### Sync vs async
The primary programming model in the page server is synchronous,
blocking code. However, there are some places where async code is
used. Be very careful when mixing sync and async code.
Async is primarily used to wait for incoming data on network
connections. For example, all WAL receivers have a shared thread pool,
with one async Task for each connection. Once a piece of WAL has been
received from the network, the thread calls the blocking functions in
the Repository to process the WAL.

View File

@@ -1,77 +0,0 @@
# WAL Redo
To reconstruct a particular page version from an image of the page and
some WAL records, the pageserver needs to replay the WAL records. This
happens on-demand, when a GetPage@LSN request comes in, or as part of
background jobs that reorganize data for faster access.
It's important that data cannot leak from one tenant to another, and
that a corrupt WAL record on one timeline doesn't affect other tenants
or timelines.
## Multi-tenant security
If you have direct access to the WAL directory, or if you have
superuser access to a running PostgreSQL server, it's easy to
construct a malicious or corrupt WAL record that causes the WAL redo
functions to crash, or to execute arbitrary code. That is not a
security problem for PostgreSQL; if you have superuser access, you
have full access to the system anyway.
The Neon pageserver, however, is multi-tenant. It needs to execute WAL
belonging to different tenants in the same system, and malicious WAL
in one tenant must not affect other tenants.
A separate WAL redo process is launched for each tenant, and the
process uses the seccomp(2) system call to restrict its access to the
bare minimum needed to replay WAL records. The process does not have
access to the filesystem or network. It can only communicate with the
parent pageserver process through a pipe.
If an attacker creates a malicious WAL record and injects it into the
WAL stream of a timeline, he can take control of the WAL redo process
in the pageserver. However, the WAL redo process cannot access the
rest of the system. And because there is a separate WAL redo process
for each tenant, the hijacked WAL redo process can only see WAL and
data belonging to the same tenant, which the attacker would have
access to anyway.
## WAL-redo process communication
The WAL redo process runs the 'postgres' executable, launched with a
Neon-specific command-line option to put it into WAL-redo process
mode. The pageserver controls the lifetime of the WAL redo processes,
launching them as needed. If a tenant is detached from the pageserver,
any WAL redo processes for that tenant are killed.
The pageserver communicates with each WAL redo process over its
stdin/stdout/stderr. It works in request-response model with a simple
custom protocol, described in walredo.rs. To replay a set of WAL
records for a page, the pageserver sends the "before" image of the
page and the WAL records over 'stdin', followed by a command to
perform the replay. The WAL redo process responds with an "after"
image of the page.
## Special handling of some records
Some WAL record types are handled directly in the pageserver, by
bespoken Rust code, and are not sent over to the WAL redo process.
This includes SLRU-related WAL records, like commit records. SLRUs
don't use the standard Postgres buffer manager, so dealing with them
in the Neon WAL redo mode would require quite a few changes to
Postgres code and special handling in the protocol anyway.
Some record types that include a full-page-image (e.g. XLOG_FPI) are
also handled specially when incoming WAL is processed already, and are
stored as page images rather than WAL records.
## Records that modify multiple pages
Some Postgres WAL records modify multiple pages. Such WAL records are
duplicated, so that a copy is stored for each affected page. This is
somewhat wasteful, but because most WAL records only affect one page,
the overhead is acceptable.
The WAL redo always happens for one particular page. If the WAL record
coantains changes to other pages, they are ignored.

View File

@@ -1,11 +0,0 @@
# Page server architecture
The Page Server has a few different duties:
- Respond to GetPage@LSN requests from the Compute Nodes
- Receive WAL from WAL safekeeper, and store it
- Upload data to S3 to make it durable, download files from S3 as needed
S3 is the main fault-tolerant storage of all data, as there are no Page Server
replicas. We use a separate fault-tolerant WAL service to reduce latency. It
keeps track of WAL records which are not synced to S3 yet.

View File

@@ -77,7 +77,7 @@ Upon storage node restart recent WAL files are applied to appropriate pages and
### **Checkpointing**
No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer.
No such mechanism is needed. Or we may look at the storage node as at kind of continuous chekpointer.
### **Full page writes (torn page protection)**
@@ -111,13 +111,13 @@ Since we are storing page diffs of variable sizes there is no structural depende
### **Chunk metadata**
Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunk should always consult this data when merging SSTables and applying delete markers.
Chunk metadata is a file lies in chunk directory that stores info about current snapshots and PITR regions. Chunck should always consult this data when merging SSTables and applying delete markers.
### **Chunk splitting**
*(NB: following paragraph is about how to avoid page splitting)*
When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global metadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following:
When chunks hits some soft storage limit (let's say 100Gb) it should be split in half and global matadata about chunk boundaries should be updated. Here i assume that chunk split is a local operation happening on single node. Process of chink splitting should look like following:
1. Find separation key and spawn two new chunks with [lo, mid) [mid, hi) boundaries.
@@ -166,7 +166,7 @@ Multi-tenant storage makes sense even on a laptop, when you work with different
Few databases are stored in one chunk, replicated three times
- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we always may manually move chunks around the cluster.
- When database can't fit into one storage node it can occupy lots of chunks that were split while database was growing. Chunk placement on nodes is controlled by us with some automatization, but we alway may manually move chunks around the cluster.
<img width="940" alt="Screenshot_2021-02-22_at_16 49 10" src="https://user-images.githubusercontent.com/284219/108729815-fb071e00-753b-11eb-86e0-be6703e47d82.png">

View File

@@ -123,7 +123,7 @@ Show currently attached storages. For example:
> zenith storage list
NAME USED TYPE OPTIONS PATH
local 5.1G zenith-local /opt/zenith/store/local
local.compr 20.4G zenith-local compression=on /opt/zenith/store/local.compr
local.compr 20.4G zenith-local comression=on /opt/zenith/store/local.compr
zcloud 60G zenith-remote zenith.tech/stas/mystore
s3tank 80G S3
```
@@ -136,9 +136,9 @@ s3tank 80G S3
## pg
Manages postgres data directories and can start postgres instances with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themselves.
Manages postgres data directories and can start postgreses with proper configuration. An experienced user may avoid using that (except pg create) and configure/run postgres by themself.
Pg is a term for a single postgres running on some data. I'm trying to avoid separation of datadir management and postgres instance management -- both that concepts bundled here together.
Pg is a term for a single postgres running on some data. I'm trying to avoid here separation of datadir management and postgres instance management -- both that concepts bundled here together.
**zenith pg create** [--no-start --snapshot --cow] -s storage-name -n pgdata

View File

@@ -22,7 +22,7 @@ In addition to the WAL safekeeper nodes, the WAL is archived in
S3. WAL that has been archived to S3 can be removed from the
safekeepers, so the safekeepers don't need a lot of disk space.
```
+----------------+
+-----> | WAL safekeeper |
| +----------------+
@@ -42,23 +42,23 @@ safekeepers, so the safekeepers don't need a lot of disk space.
\
\
\
\ +--------+
\ | |
+------> | S3 |
| |
+--------+
\ +--------+
\ | |
+--> | S3 |
| |
+--------+
```
Every WAL safekeeper holds a section of WAL, and a VCL value.
The WAL can be divided into three portions:
```
VCL LSN
| |
V V
.................ccccccccccccccccccccXXXXXXXXXXXXXXXXXXXXXXX
Archived WAL Completed WAL In-flight WAL
```
Note that all this WAL kept in a safekeeper is a contiguous section.
This is different from Aurora: In Aurora, there can be holes in the

View File

@@ -31,7 +31,7 @@ Ideally, just one binary that incorporates all elements we need.
#### Components:
- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responses to show them in a user-friendly way.
- **zenith-CLI** - interface for end-users. Turns commands to REST requests and handles responces to show them in a user-friendly way.
CLI proposal is here https://github.com/libzenith/rfcs/blob/003-laptop-cli.md/003-laptop-cli.md
WIP code is here: https://github.com/libzenith/postgres/tree/main/pageserver/src/bin/cli

View File

@@ -25,9 +25,9 @@ To make changes in the catalog you need to run compute nodes
zenith start /home/pipedpiper/northwind:main -- starts a compute instance
zenith start zenith://zenith.tech/northwind:main -- starts a compute instance in the cloud
-- you can start a compute node against any hash or branch
zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start another compute instance (on different port)
zenith start /home/pipedpiper/northwind:experimental --port 8008 -- start anothe compute instance (on different port)
-- you can start a compute node against any hash or branch
zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start another compute instance (on different port)
zenith start /home/pipedpiper/northwind:<hash> --port 8009 -- start anothe compute instance (on different port)
-- After running some DML you can run
-- zenith status and see how there are two WAL streams one on top of

View File

@@ -121,7 +121,7 @@ repository, launch an instance on the same branch in both clones, and
later try to push/pull between them? Perhaps create a new timeline
every time you start up an instance? Then you would detect that the
timelines have diverged. That would match with the "epoch" concept
that we have in the WAL safekeeper
that we have in the WAL safekeepr
### zenith checkout/commit

View File

@@ -2,9 +2,9 @@ While working on export/import commands, I understood that they fit really well
We may think about backups as snapshots in a different format (i.e plain pgdata format, basebackup tar format, WAL-G format (if they want to support it) and so on). They use same storage API, the only difference is the code that packs/unpacks files.
Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postgres to zenith.
Even if zenith aims to maintains durability using it's own snapshots, backups will be useful for uploading data from postges to zenith.
So here is an attempt to design consistent CLI for different usage scenarios:
So here is an attemt to design consistent CLI for diferent usage scenarios:
#### 1. Start empty pageserver.
That is what we have now.

View File

@@ -3,7 +3,7 @@
GetPage@LSN can be called with older LSNs, and the page server needs
to be able to reconstruct older page versions. That's needed for
having read-only replicas that lag behind the primary, or that are
"anchored" at an older LSN, and internally in the page server when you
"anchored" at an older LSN, and internally in the page server whne you
branch at an older point in time. How do you do that?
For now, I'm not considering incremental snapshots at all. I don't
@@ -192,7 +192,7 @@ for a particular relation readily available alongside the snapshot
files, and you don't need to track what snapshot LSNs exist
separately.
(If we wanted to minimize the number of files, you could include the
(If we wanted to minize the number of files, you could include the
snapshot @300 and the WAL between 200 and 300 in the same file, but I
feel it's probably better to keep them separate)

View File

@@ -121,7 +121,7 @@ The properties of s3 that we depend on are:
list objects
streaming read of entire object
read byte range from object
streaming write new object (may use multipart upload for better reliability)
streaming write new object (may use multipart upload for better relialibity)
delete object (that should not disrupt an already-started read).
Uploaded files, restored backups, or s3 buckets controlled by users could contain malicious content. We should always validate that objects contain the content theyre supposed to. Incorrect, Corrupt or malicious-looking contents should cause software (cloud tools, pageserver) to fail gracefully.

View File

@@ -40,7 +40,7 @@ b) overwrite older pages with the newer pages -- if there is no replica we proba
I imagine that newly created pages would just be added to the back of PageStore (again in queue-like fashion) and this way there wouldn't be any meaningful ordering inside of that queue. When we are forming a new incremental snapshot we may prohibit any updates to the current set of pages in PageStore (giving up on single page version rule) and cut off that whole set when snapshot creation is complete.
With option b) we can also treat PageStor as an uncompleted incremental snapshot.
With option b) we can also treat PageStor as an uncompleted increamental snapshot.
### LocalStore
@@ -123,7 +123,7 @@ As far as I understand Bookfile/Aversion addresses versioning and serialization
As for exact data that should go to snapshots I think it is the following for each snapshot:
* format version number
* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknown key are present. If we add something backward compatible to the file we can keep the version number.
* set of key/values to interpret content (e.g. is page compression enabled, is that a full or incremental snapshot, previous snapshot id, is there WAL at the end on file, etc) -- it is up to a reader to decide what to do if some keys are missing or some unknow key are present. If we add something backward compatible to the file we can keep the version number.
* array of [BuffTag, corresponding offset in file] for pages -- IIUC that is analogous to ToC in Bookfile
* array of [(BuffTag, LSN), corresponding offset in file] for the WAL records
* pages, one by one
@@ -131,7 +131,7 @@ As for exact data that should go to snapshots I think it is the following for ea
It is also important to be able to load metadata quickly since it would be one of the main factors impacting the time of page server start. E.g. if would store/cache about 10TB of data per page server, the size of uncompressed page references would be about 30GB (10TB / ( 8192 bytes page size / ( ~18 bytes per ObjectTag + 8 bytes offset in the file))).
1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when relation_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset deltas would be small).
1) Since our ToC/array of entries can be sorted by ObjectTag we can store the whole BufferTag only when realtion_id is changed and store only delta-encoded offsets for a given relation. That would reduce the average per-page metadata size to something less than 4 bytes instead of 26 (assuming that pages would follow the same order and offset delatas would be small).
2) It makes sense to keep ToC at the beginning of the file to avoid extra seeks to locate it. Doesn't matter too much with the local files but matters on S3 -- if we are accessing a lot of ~1Gb files with the size of metadata ~ 1Mb then the time to transfer this metadata would be comparable with access latency itself (which is about a half of a second). So by slurping metadata with one read of file header instead of N reads we can improve the speed of page server start by this N factor.
I think both of that optimizations can be done later, but that is something to keep in mind when we are designing our storage serialization routines.

View File

@@ -7,13 +7,13 @@ and e.g. prevents electing two proposers with the same term -- it is actually
called `term` in the code. The second, called `epoch`, reflects progress of log
receival and this might lag behind `term`; safekeeper switches to epoch `n` when
it has received all committed log records from all `< n` terms. This roughly
corresponds to proposed in
correspones to proposed in
https://github.com/zenithdb/rfcs/pull/3/files
This makes our biggest our difference from Raft. In Raft, every log record is
stamped with term in which it was generated; while we essentially store in
stamped with term in which it was generated; while we essentialy store in
`epoch` only the term of the highest record on this safekeeper -- when we know
it -- because during recovery generally we don't, and `epoch` is bumped directly
to the term of the proposer who performs the recovery when it is finished. It is

View File

@@ -124,7 +124,7 @@ Each storage node can subscribe to the relevant sets of keys and maintain a loca
### Safekeeper address discovery
During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertise something more useful.
During the startup safekeeper should publish the address he is listening on as the part of `{"sk_#{sk_id}" => ip_address}`. Then the pageserver can resolve `sk_#{sk_id}` to the actual address. This way it would work both locally and in the cloud setup. Safekeeper should have `--advertised-address` CLI option so that we can listen on e.g. 0.0.0.0 but advertize something more useful.
### Safekeeper behavior
@@ -195,7 +195,7 @@ sequenceDiagram
PS1->>SK1: start replication
```
#### Behaviour of services during typical operations
#### Behavour of services during typical operations
```mermaid
sequenceDiagram
@@ -250,7 +250,7 @@ sequenceDiagram
PS2->>M: Register downloaded timeline
PS2->>M: Get safekeepers for timeline, subscribe to changes
PS2->>SK1: Start replication to catch up
note over O: PS2 caught up, time to switch compute
note over O: PS2 catched up, time to switch compute
O->>C: Restart compute with new pageserver url in config
note over C: Wal push is restarted
loop request pages

View File

@@ -1,151 +0,0 @@
# Dispatching a connection
For each client connection, Neon service needs to authenticate the
connection, and route it to the right PostgreSQL instance.
## Authentication
There are three different ways to authenticate:
- anonymous; no authentication needed
- PostgreSQL authentication
- github single sign-on using browser
In anonymous access, the user doesn't need to perform any
authentication at all. This can be used e.g. in interactive PostgreSQL
documentation, allowing you to run the examples very quickly. Similar
to sqlfiddle.com.
PostgreSQL authentication works the same as always. All the different
PostgreSQL authentication options like SCRAM, kerberos, etc. are
available. [1]
The third option is to authenticate with github single sign-on. When
you open the connection in psql, you get a link that you open with
your browser. Opening the link redirects you to github authentication,
and lets the connection to proceed. This is also known as "Link auth" [2].
## Routing the connection
When a client starts a connection, it needs to be routed to the
correct PostgreSQL instance. Routing can be done by the proxy, acting
as a man-in-the-middle, or the connection can be routed at the network
level based on the hostname or IP address.
Either way, Neon needs to identify which PostgreSQL instance the
connection should be routed to. If the instance is not already
running, it needs to be started. Some connections always require a new
PostgreSQL instance to be created, e.g. if you want to run a one-off
query against a particular point-in-time.
The PostgreSQL instance is identified by:
- Neon account (possibly anonymous)
- cluster (known as tenant in the storage?)
- branch or snapshot name
- timestamp (PITR)
- primary or read-replica
- one-off read replica
- one-off writeable branch
When you are using regular PostgreSQL authentication or anonymous
access, the connection URL needs to contain all the information needed
for the routing. With github single sign-on, the browser is involved
and some details - the Neon account in particular - can be deduced
from the authentication exchange.
There are three methods for identifying the PostgreSQL instance:
- Browser interaction (link auth)
- Options in the connection URL and the domain name
- A pre-defined endpoint, identified by domain name or IP address
### Link Auth
postgres://<username>@start.neon.tech/<dbname>
This gives you a link that you open in browser. Clicking the link
performs github authentication, and the Neon account name is
provided to the proxy behind the scenes. The proxy routes the
connection to the primary PostgreSQL instance in cluster called
"main", branch "main".
Further ideas:
- You could pre-define a different target for link auth
connections in the UI.
- You could have a drop-down in the browser, allowing you to connect
to any cluster you want. Link Auth can be like Teleport.
### Connection URL
The connection URL looks like this:
postgres://<username>@<cluster-id>.db.neon.tech/<dbname>
By default, this connects you to the primary PostgreSQL instance
running on the "main" branch in the named cluster [3]. However, you can
change that by specifying options in the connection URL. The following
options are supported:
| option name | Description | Examples |
| --- | --- | --- |
| cluster | Cluster name | cluster:myproject |
| branch | Branch name | branch:main |
| timestamp | Connect to an instance at given point-in-time. | timestamp:2022-04-08 timestamp:2022-04-08T11:42:16Z |
| lsn | Connect to an instance at given LSN | lsn:0/12FF0420 |
| read-replica | Connect to a read-replica. If the parameter is 'new', a new instance is created for this session. | read-replica read-replica:new |
For example, to read branch 'testing' as it was on Mar 31, 2022, you could
specify a timestamp in the connection URL [4]:
postgres://alice@cluster-1234.db.neon.tech/postgres?options=branch:testing,timestamp:2022-03-31
Connecting with cluster name and options can be disabled in the UI. If
disabled, you can only connect using a pre-defined endpoint.
### Pre-defined Endpoint
Instead of providing the cluster name, branch, and all those options
in the connection URL, you can define a named endpoint with the same
options.
In the UI, click "create endpoint". Fill in the details:
- Cluster name
- Branch
- timestamp or LSN
- is this for the primary or for a read replica
- etc.
When you click Finish, a named endpoint is created. You can now use the endpoint ID to connect:
postgres://<username>@<endpoint-id>.endpoint.neon.tech/<dbname>
An endpoint can be assigned a static or dynamic IP address, so that
you can connect to it with clients that don't support TLS SNI. Maybe
bypass the proxy altogether, but that ought to be invisible to the
user.
You can limit the range of source IP addresses that are allowed to
connect to an endpoint. An endpoint can also be exposed in an Amazon
VPC, allowing direct connections from applications.
# Footnotes
[1] I'm not sure how feasible it is to set up configure like Kerberos
or LDAP in a cloud environment. But in principle I think we should
allow customers to have the full power of PostgreSQL, including all
authentication options. However, it's up to the customer to configure
it correctly.
[2] Link is a way to both authenticate and to route the connection
[3] This assumes that cluster-ids are globally unique, across all
Neon accounts.
[4] The syntax accepted in the connection URL is limited by libpq. The
only way to pass arbitrary options to the server (or our proxy) is
with the "options" keyword, and the options must be percent-encoded. I
think the above would work but i haven't tested it

View File

@@ -49,7 +49,7 @@ topics.
RFC lifecycle:
- Should be submitted in a pull request with and full RFC text in a committed markdown file and copy of the Summary and Motivation sections also included in the PR body.
- Should be submitted in a pull request with and full RFC text in a commited markdown file and copy of the Summary and Motivation sections also included in the PR body.
- RFC should be published for review before most of the actual code is written. This isnt a strict rule, dont hesitate to experiment and build a POC in parallel with writing an RFC.
- Add labels to the PR in the same manner as you do Issues. Example TBD
- Request the review from your peers. Reviewing the RFCs from your peers is a priority, same as reviewing the actual code.

View File

@@ -22,8 +22,8 @@ so we don't want to give users access to the functionality that we don't think i
* pageserver - calculate the size consumed by a timeline and add it to the feedback message.
* safekeeper - pass feedback message from pageserver to compute.
* compute - receive feedback message, enforce size limit based on GUC `neon.max_cluster_size`.
* console - set and update `neon.max_cluster_size` setting
* compute - receive feedback message, enforce size limit based on GUC `zenith.max_cluster_size`.
* console - set and update `zenith.max_cluster_size` setting
## Proposed implementation
@@ -36,12 +36,12 @@ This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
Alternatively, we could count only relation data. As in pg_database_size().
This approach is somewhat more user-friendly because it is the data that is really affected by the user.
On the other hand, it puts us in a weaker position than other services, i.e., RDS.
We will need to refactor the timeline_size counter or add another counter to implement it.
We will need to refactor the timeline_size counter or add another counter to implement it.
Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
Then this size should be reported to compute node.
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.`
`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
(PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).
@@ -49,7 +49,7 @@ This message is received by the safekeeper and propagated to compute node as a p
Finally, when compute node receives the `current_timeline_size` from safekeeper (or from pageserver directly), it updates the global variable.
And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > neon.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
And then every zenith_extend() operation checks if limit is reached `(current_timeline_size > zenith.max_cluster_size)` and throws `ERRCODE_DISK_FULL` error if so.
(see Postgres error codes [https://www.postgresql.org/docs/devel/errcodes-appendix.html](https://www.postgresql.org/docs/devel/errcodes-appendix.html))
TODO:
@@ -64,16 +64,16 @@ We should warn users if the limit is soon to be reached.
### **Reliability, failure modes and corner cases**
1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.
### **Security implications**
We treat compute as an untrusted component. That's why we try to isolate it with secure container runtime or a VM.
Malicious users may change the `neon.max_cluster_size`, so we need an extra size limit check.
Malicious users may change the `zenith.max_cluster_size`, so we need an extra size limit check.
To cover this case, we also monitor the compute node size in the console.

Some files were not shown because too many files have changed in this diff Show More