Try larger sleep

Wait for pid death
Add hacky solution
2026-07-17 02:50:38 +00:00 · 2022-08-12 09:52:40 -04:00 · 2022-08-12 09:21:44 -04:00 · 2022-08-12 09:05:51 -04:00 · 2022-08-12 09:01:17 -04:00 · 2022-08-12 19:13:42 +07:00
308 changed files with 19813 additions and 11119 deletions
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -0,0 +1,13 @@
+# The binaries are really slow if you compile them in 'dev' mode with the defaults.
+# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
+# optimizations enabled by "opt-level=1" don't affect debuggability too much.
+#
+# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
+#
+[profile.dev.package."*"]
+# Set the default for dependencies in Development mode.
+opt-level = 3
+
+[profile.dev]
+# Turn on a small amount of optimization in Development mode.
+opt-level = 1
--- a/.circleci/ansible/systemd/safekeeper.service
+++ b/.circleci/ansible/systemd/safekeeper.service
@@ -1,18 +0,0 @@
-[Unit]
-Description=Zenith safekeeper
-After=network.target auditd.service
-
-[Service]
-Type=simple
-User=safekeeper
-Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
-ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -p {{ first_pageserver }}:6400 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="wal"}'
-ExecReload=/bin/kill -HUP $MAINPID
-KillMode=mixed
-KillSignal=SIGINT
-Restart=on-failure
-TimeoutSec=10
-LimitNOFILE=30000000
-
-[Install]
-WantedBy=multi-user.target
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,894 +0,0 @@
-version: 2.1
-
-executors:
-  neon-xlarge-executor:
-    resource_class: xlarge
-    docker:
-      # NB: when changed, do not forget to update rust image tag in all Dockerfiles
-      - image: zimg/rust:1.58
-  neon-executor:
-    docker:
-      - image: zimg/rust:1.58
-
-jobs:
-  # A job to build postgres
-  build-postgres:
-    executor: neon-xlarge-executor
-    parameters:
-      build_type:
-        type: enum
-        enum: ["debug", "release"]
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
-    steps:
-        # Checkout the git repo (circleci doesn't have a flag to enable submodules here)
-      - checkout
-
-        # Grab the postgres git revision to build a cache key.
-        # Append makefile as it could change the way postgres is built.
-        # Note this works even though the submodule hasn't been checkout out yet.
-      - run:
-          name: Get postgres cache key
-          command: |
-              git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
-              cat Makefile >> /tmp/cache-key-postgres
-
-      - restore_cache:
-          name: Restore postgres cache
-          keys:
-            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
-
-        # Build postgres if the restore_cache didn't find a build.
-        # `make` can't figure out whether the cache is valid, since
-        # it only compares file timestamps.
-      - run:
-          name: build postgres
-          command: |
-            if [ ! -e tmp_install/bin/postgres ]; then
-              # "depth 1" saves some time by not cloning the whole repo
-              git submodule update --init --depth 1
-              # bail out on any warnings
-              COPT='-Werror' mold -run make postgres -j$(nproc)
-            fi
-
-      - save_cache:
-          name: Save postgres cache
-          key: v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
-          paths:
-            - tmp_install
-
-  # A job to build Neon rust code
-  build-neon:
-    executor: neon-xlarge-executor
-    parameters:
-      build_type:
-        type: enum
-        enum: ["debug", "release"]
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
-    steps:
-        # Checkout the git repo (without submodules)
-      - checkout
-
-        # Grab the postgres git revision to build a cache key.
-        # Append makefile as it could change the way postgres is built.
-        # Note this works even though the submodule hasn't been checkout out yet.
-      - run:
-          name: Get postgres cache key
-          command: |
-            git rev-parse HEAD:vendor/postgres > /tmp/cache-key-postgres
-            cat Makefile >> /tmp/cache-key-postgres
-
-
-      - restore_cache:
-          name: Restore postgres cache
-          keys:
-            # Restore ONLY if the rev key matches exactly
-            - v04-postgres-cache-<< parameters.build_type >>-{{ checksum "/tmp/cache-key-postgres" }}
-
-      - restore_cache:
-          name: Restore rust cache
-          keys:
-            # Require an exact match. While an out of date cache might speed up the build,
-            # there's no way to clean out old packages, so the cache grows every time something
-            # changes.
-            - v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
-
-        # Build the rust code, including test binaries
-      - run:
-          name: Rust build << parameters.build_type >>
-          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-              CARGO_FLAGS=
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-              CARGO_FLAGS="--release --features profiling"
-            fi
-
-            export CARGO_INCREMENTAL=0
-            export CACHEPOT_BUCKET=zenith-rust-cachepot
-            export RUSTC_WRAPPER=cachepot
-            export AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}"
-            export AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}"
-            "${cov_prefix[@]}" mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
-            cachepot -s
-
-      - save_cache:
-          name: Save rust cache
-          key: v04-rust-cache-deps-<< parameters.build_type >>-{{ checksum "Cargo.lock" }}
-          paths:
-            - ~/.cargo/registry
-            - ~/.cargo/git
-            - target
-
-        # Run rust unit tests
-      - run:
-          name: cargo test
-          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-              CARGO_FLAGS=
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-              CARGO_FLAGS=--release
-            fi
-
-            "${cov_prefix[@]}" cargo test $CARGO_FLAGS
-
-        # Install the rust binaries, for use by test jobs
-      - run:
-          name: Install rust binaries
-          command: |
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
-            binaries=$(
-              "${cov_prefix[@]}" cargo metadata --format-version=1 --no-deps |
-              jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
-            )
-
-            test_exe_paths=$(
-              "${cov_prefix[@]}" cargo test --message-format=json --no-run |
-              jq -r '.executable | select(. != null)'
-            )
-
-            mkdir -p /tmp/zenith/bin
-            mkdir -p /tmp/zenith/test_bin
-            mkdir -p /tmp/zenith/etc
-
-            # Install target binaries
-            for bin in $binaries; do
-              SRC=target/$BUILD_TYPE/$bin
-              DST=/tmp/zenith/bin/$bin
-              cp $SRC $DST
-              echo $DST >> /tmp/zenith/etc/binaries.list
-            done
-
-            # Install test executables (for code coverage)
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              for bin in $test_exe_paths; do
-                SRC=$bin
-                DST=/tmp/zenith/test_bin/$(basename $bin)
-                cp $SRC $DST
-                echo $DST >> /tmp/zenith/etc/binaries.list
-              done
-            fi
-
-        # Install the postgres binaries, for use by test jobs
-      - run:
-          name: Install postgres binaries
-          command: |
-            cp -a tmp_install /tmp/zenith/pg_install
-
-      - run:
-          name: Merge coverage data
-          command: |
-            # This will speed up workspace uploads
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
-            fi
-
-        # Save the rust binaries and coverage data for other jobs in this workflow.
-      - persist_to_workspace:
-          root: /tmp/zenith
-          paths:
-            - "*"
-
-  check-codestyle-python:
-    executor: neon-executor
-    steps:
-      - checkout
-      - restore_cache:
-          keys:
-            - v2-python-deps-{{ checksum "poetry.lock" }}
-      - run:
-          name: Install deps
-          command: ./scripts/pysync
-      - save_cache:
-          key: v2-python-deps-{{ checksum "poetry.lock" }}
-          paths:
-            - /home/circleci/.cache/pypoetry/virtualenvs
-      - run:
-          name: Print versions
-          when: always
-          command: |
-              poetry run python --version
-              poetry show
-      - run:
-          name: Run yapf to ensure code format
-          when: always
-          command: poetry run yapf --recursive --diff .
-      - run:
-          name: Run mypy to check types
-          when: always
-          command: poetry run mypy .
-
-  run-pytest:
-    executor: neon-executor
-    parameters:
-      # pytest args to specify the tests to run.
-      #
-      # This can be a test file name, e.g. 'test_pgbench.py, or a subdirectory,
-      # or '-k foobar' to run tests containing string 'foobar'. See pytest man page
-      # section SPECIFYING TESTS / SELECTING TESTS for details.
-      #
-      # Select the type of Rust build. Must be "release" or "debug".
-      build_type:
-        type: string
-        default: "debug"
-      # This parameter is required, to prevent the mistake of running all tests in one job.
-      test_selection:
-        type: string
-        default: ""
-      # Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr
-      extra_params:
-        type: string
-        default: ""
-      needs_postgres_source:
-        type: boolean
-        default: false
-      run_in_parallel:
-        type: boolean
-        default: true
-      save_perf_report:
-        type: boolean
-        default: false
-    environment:
-      BUILD_TYPE: << parameters.build_type >>
-    steps:
-      - attach_workspace:
-          at: /tmp/zenith
-      - checkout
-      - when:
-          condition: << parameters.needs_postgres_source >>
-          steps:
-            - run: git submodule update --init --depth 1
-      - restore_cache:
-          keys:
-            - v2-python-deps-{{ checksum "poetry.lock" }}
-      - run:
-          name: Install deps
-          command: ./scripts/pysync
-      - save_cache:
-          key: v2-python-deps-{{ checksum "poetry.lock" }}
-          paths:
-            - /home/circleci/.cache/pypoetry/virtualenvs
-      - run:
-          name: Run pytest
-          # pytest doesn't output test logs in real time, so CI job may fail with
-          # `Too long with no output` error, if a test is running for a long time.
-          # In that case, tests should have internal timeouts that are less than
-          # no_output_timeout, specified here.
-          no_output_timeout: 10m
-          environment:
-            - ZENITH_BIN: /tmp/zenith/bin
-            - POSTGRES_DISTRIB_DIR: /tmp/zenith/pg_install
-            - TEST_OUTPUT: /tmp/test_output
-            # this variable will be embedded in perf test report
-            # and is needed to distinguish different environments
-            - PLATFORM: zenith-local-ci
-          command: |
-            PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
-            rm -rf $PERF_REPORT_DIR
-
-            TEST_SELECTION="test_runner/<< parameters.test_selection >>"
-            EXTRA_PARAMS="<< parameters.extra_params >>"
-            if [ -z "$TEST_SELECTION" ]; then
-              echo "test_selection must be set"
-              exit 1
-            fi
-            if << parameters.run_in_parallel >>; then
-              EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
-            fi
-            if << parameters.save_perf_report >>; then
-              if [[ $CIRCLE_BRANCH == "main" ]]; then
-                mkdir -p "$PERF_REPORT_DIR"
-                EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
-              fi
-            fi
-
-            export GITHUB_SHA=$CIRCLE_SHA1
-
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              cov_prefix=(scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage run)
-            elif [[ $BUILD_TYPE == "release" ]]; then
-              cov_prefix=()
-            fi
-
-            # Run the tests.
-            #
-            # The junit.xml file allows CircleCI to display more fine-grained test information
-            # in its "Tests" tab in the results page.
-            # --verbose prints name of each test (helpful when there are
-            # multiple tests in one file)
-            # -rA prints summary in the end
-            # -n4 uses four processes to run tests via pytest-xdist
-            # -s is not used to prevent pytest from capturing output, because tests are running
-            # in parallel and logs are mixed between different tests
-            "${cov_prefix[@]}" ./scripts/pytest \
-              --junitxml=$TEST_OUTPUT/junit.xml \
-              --tb=short \
-              --verbose \
-              -m "not remote_cluster" \
-              -rA $TEST_SELECTION $EXTRA_PARAMS
-
-            if << parameters.save_perf_report >>; then
-              if [[ $CIRCLE_BRANCH == "main" ]]; then
-                export REPORT_FROM="$PERF_REPORT_DIR"
-                export REPORT_TO=local
-                scripts/generate_and_push_perf_report.sh
-              fi
-            fi
-      - run:
-          # CircleCI artifacts are preserved one file at a time, so skipping
-          # this step isn't a good idea. If you want to extract the
-          # pageserver state, perhaps a tarball would be a better idea.
-          name: Delete all data but logs
-          when: always
-          command: |
-            du -sh /tmp/test_output/*
-            find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
-            du -sh /tmp/test_output/*
-      - store_artifacts:
-          path: /tmp/test_output
-      # The store_test_results step tells CircleCI where to find the junit.xml file.
-      - store_test_results:
-          path: /tmp/test_output
-      - run:
-          name: Merge coverage data
-          command: |
-            # This will speed up workspace uploads
-            if [[ $BUILD_TYPE == "debug" ]]; then
-              scripts/coverage "--profraw-prefix=$CIRCLE_JOB" --dir=/tmp/zenith/coverage merge
-            fi
-      # Save coverage data (if any)
-      - persist_to_workspace:
-          root: /tmp/zenith
-          paths:
-            - "*"
-
-  coverage-report:
-    executor: neon-xlarge-executor
-    steps:
-      - attach_workspace:
-          at: /tmp/zenith
-      - checkout
-      - restore_cache:
-          name: Restore rust cache
-          keys:
-            # Require an exact match. While an out of date cache might speed up the build,
-            # there's no way to clean out old packages, so the cache grows every time something
-            # changes.
-            - v04-rust-cache-deps-debug-{{ checksum "Cargo.lock" }}
-      - run:
-          name: Build coverage report
-          command: |
-            COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
-
-            scripts/coverage \
-              --dir=/tmp/zenith/coverage report \
-              --input-objects=/tmp/zenith/etc/binaries.list \
-              --commit-url=$COMMIT_URL \
-              --format=github
-      - run:
-          name: Upload coverage report
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-            REPORT_URL=https://neondatabase.github.io/zenith-coverage-data/$CIRCLE_SHA1
-            COMMIT_URL=https://github.com/neondatabase/neon/commit/$CIRCLE_SHA1
-
-            scripts/git-upload \
-              --repo=https://$VIP_VAP_ACCESS_TOKEN@github.com/neondatabase/zenith-coverage-data.git \
-              --message="Add code coverage for $COMMIT_URL" \
-              copy /tmp/zenith/coverage/report $CIRCLE_SHA1 # COPY FROM TO_RELATIVE
-
-            # Add link to the coverage report to the commit
-            curl -f -X POST \
-            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"state\": \"success\",
-                \"context\": \"zenith-coverage\",
-                \"description\": \"Coverage report is ready\",
-                \"target_url\": \"$REPORT_URL\"
-              }"
-
-  # Build neondatabase/neon:latest image and push it to Docker hub
-  docker-image:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build \
-              --pull \
-              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:latest .
-            docker push neondatabase/neon:${DOCKER_TAG}
-            docker push neondatabase/neon:latest
-
-  # Build neondatabase/compute-node:latest image and push it to Docker hub
-  docker-image-compute:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Build and push compute-tools Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            docker build \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/compute-tools:local \
-              --tag neondatabase/compute-tools:latest \
-              -f Dockerfile.compute-tools .
-            # Only push :latest image
-            docker push neondatabase/compute-tools:latest
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push compute-node Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
-              --tag neondatabase/compute-node:latest vendor/postgres \
-              --build-arg COMPUTE_TOOLS_TAG=local
-            docker push neondatabase/compute-node:${DOCKER_TAG}
-            docker push neondatabase/compute-node:latest
-
-  # Build production neondatabase/neon:release image and push it to Docker hub
-  docker-image-release:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            docker build \
-              --pull \
-              --build-arg GIT_VERSION=${CIRCLE_SHA1} \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/neon:${DOCKER_TAG} --tag neondatabase/neon:release .
-            docker push neondatabase/neon:${DOCKER_TAG}
-            docker push neondatabase/neon:release
-
-  # Build production neondatabase/compute-node:release image and push it to Docker hub
-  docker-image-compute-release:
-    docker:
-      - image: cimg/base:2021.04
-    steps:
-      - checkout
-      - setup_remote_docker:
-          docker_layer_caching: true
-      - run:
-          name: Build and push compute-tools Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            docker build \
-              --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \
-              --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \
-              --tag neondatabase/compute-tools:release \
-              --tag neondatabase/compute-tools:local \
-              -f Dockerfile.compute-tools .
-            # Only push :release image
-            docker push neondatabase/compute-tools:release
-      - run:
-          name: Init postgres submodule
-          command: git submodule update --init --depth 1
-      - run:
-          name: Build and push compute-node Docker image
-          command: |
-            echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            docker build --tag neondatabase/compute-node:${DOCKER_TAG} \
-              --tag neondatabase/compute-node:release vendor/postgres \
-              --build-arg COMPUTE_TOOLS_TAG=local
-            docker push neondatabase/compute-node:${DOCKER_TAG}
-            docker push neondatabase/compute-node:release
-
-  deploy-staging:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i staging.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-staging-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${STAGING_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/staging.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  deploy-neon-stress:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i neon-stress.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-neon-stress-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${NEON_STRESS_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG=$(git log --oneline|wc -l)
-            helm upgrade neon-stress-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-stress-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/neon-stress.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  deploy-release:
-    docker:
-      - image: cimg/python:3.10
-    steps:
-      - checkout
-      - setup_remote_docker
-      - run:
-          name: Setup ansible
-          command: |
-            pip install --progress-bar off --user ansible boto3
-      - run:
-          name: Redeploy
-          command: |
-            cd "$(pwd)/.circleci/ansible"
-
-            RELEASE=true ./get_binaries.sh
-
-            echo "${TELEPORT_SSH_KEY}"  | tr -d '\n'| base64 --decode >ssh-key
-            echo "${TELEPORT_SSH_CERT}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-            chmod 0600 ssh-key
-            ssh-add ssh-key
-            rm -f ssh-key ssh-key-cert.pub
-
-            ansible-playbook deploy.yaml -i production.hosts
-            rm -f neon_install.tar.gz .neon_current_version
-
-  deploy-release-proxy:
-    docker:
-      - image: cimg/base:2021.04
-    environment:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - checkout
-      - run:
-          name: Store kubeconfig file
-          command: |
-            echo "${PRODUCTION_KUBECONFIG_DATA}" | base64 --decode > ${KUBECONFIG}
-            chmod 0600 ${KUBECONFIG}
-      - run:
-          name: Setup helm v3
-          command: |
-            curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
-            helm repo add neondatabase https://neondatabase.github.io/helm-charts
-      - run:
-          name: Re-deploy proxy
-          command: |
-            DOCKER_TAG="release-$(git log --oneline|wc -l)"
-            helm upgrade neon-proxy       neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy.yaml --set image.tag=${DOCKER_TAG} --wait
-            helm upgrade neon-proxy-scram neondatabase/neon-proxy --install -f .circleci/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait
-
-  # Trigger a new remote CI job
-  remote-ci-trigger:
-    docker:
-      - image: cimg/base:2021.04
-    parameters:
-      remote_repo:
-        type: string
-    environment:
-      REMOTE_REPO: << parameters.remote_repo >>
-    steps:
-      - run:
-          name: Set PR's status to pending
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-            https://api.github.com/repos/$LOCAL_REPO/statuses/$CIRCLE_SHA1 \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"state\": \"pending\",
-                \"context\": \"neon-cloud-e2e\",
-                \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-              }"
-      - run:
-          name: Request a remote CI test
-          command: |
-            LOCAL_REPO=$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME
-
-            curl -f -X POST \
-            https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-            -H "Accept: application/vnd.github.v3+json" \
-            --user "$CI_ACCESS_TOKEN" \
-            --data \
-              "{
-                \"ref\": \"main\",
-                \"inputs\": {
-                  \"ci_job_name\": \"neon-cloud-e2e\",
-                  \"commit_hash\": \"$CIRCLE_SHA1\",
-                  \"remote_repo\": \"$LOCAL_REPO\"
-                }
-              }"
-
-workflows:
-  build_and_test:
-    jobs:
-      - check-codestyle-python
-      - build-postgres:
-          name: build-postgres-<< matrix.build_type >>
-          matrix:
-            parameters:
-              build_type: ["debug", "release"]
-      - build-neon:
-          name: build-neon-<< matrix.build_type >>
-          matrix:
-            parameters:
-              build_type: ["debug", "release"]
-          requires:
-            - build-postgres-<< matrix.build_type >>
-      - run-pytest:
-          name: pg_regress-tests-<< matrix.build_type >>
-          context: PERF_TEST_RESULT_CONNSTR
-          matrix:
-            parameters:
-              build_type: ["debug", "release"]
-          test_selection: batch_pg_regress
-          needs_postgres_source: true
-          requires:
-            - build-neon-<< matrix.build_type >>
-      - run-pytest:
-          name: other-tests-<< matrix.build_type >>
-          matrix:
-            parameters:
-              build_type: ["debug", "release"]
-          test_selection: batch_others
-          requires:
-            - build-neon-<< matrix.build_type >>
-      - run-pytest:
-          name: benchmarks
-          context: PERF_TEST_RESULT_CONNSTR
-          build_type: release
-          test_selection: performance
-          run_in_parallel: false
-          save_perf_report: true
-          requires:
-            - build-neon-release
-      - coverage-report:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          requires:
-            # TODO: consider adding more
-            - other-tests-debug
-      - docker-image:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - docker-image-compute:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - deploy-staging:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-      - deploy-staging-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-
-      - deploy-neon-stress:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-      - deploy-neon-stress-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - main
-          requires:
-            - docker-image
-
-      - docker-image-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - docker-image-compute-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # Build image only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - pg_regress-tests-release
-            - other-tests-release
-      - deploy-release:
-          # Context gives an ability to login
-          context: Docker Hub
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - docker-image-release
-      - deploy-release-proxy:
-          # deploy only for commits to main
-          filters:
-            branches:
-              only:
-                - release
-          requires:
-            - docker-image-release
-      - remote-ci-trigger:
-          # Context passes credentials for gh api
-          context: CI_ACCESS_TOKEN
-          remote_repo: "neondatabase/cloud"
-          requires:
-            # XXX: Successful build doesn't mean everything is OK, but
-            # the job to be triggered takes so much time to complete (~22 min)
-            # that it's better not to wait for the commented-out steps
-            - build-neon-release
-            # - pg_regress-tests-release
-            # - other-tests-release
--- a/.dockerignore
+++ b/.dockerignore
@@ -9,8 +9,8 @@ tmp_install
 tmp_check_cli
 test_output
 .vscode
-.zenith
-integration_tests/.zenith
+.neon
+integration_tests/.neon
 .mypy_cache

 Dockerfile
--- a/.github/actions/download/action.yml
+++ b/.github/actions/download/action.yml
@@ -0,0 +1,56 @@
+name: "Download an artifact"
+description: "Custom download action"
+inputs:
+  name:
+    description: "Artifact name"
+    required: true
+  path:
+    description: "A directory to put artifact into"
+    default: "."
+    required: false
+  skip-if-does-not-exist:
+    description: "Allow to skip if file doesn't exist, fail otherwise"
+    default: false
+    required: false
+
+runs:
+  using: "composite"
+  steps:
+    - name: Download artifact
+      id: download-artifact
+      shell: bash -euxo pipefail {0}
+      env:
+        TARGET: ${{ inputs.path }}
+        ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
+        SKIP_IF_DOES_NOT_EXIST: ${{ inputs.skip-if-does-not-exist }}
+      run: |
+        BUCKET=neon-github-public-dev
+        PREFIX=artifacts/${GITHUB_RUN_ID}
+        FILENAME=$(basename $ARCHIVE)
+        # Take the key that sorts last by version order among those containing the file name (-F: literal match, not a regex)
+        S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${PREFIX} | jq -r '.Contents[].Key' | grep -F "${FILENAME}" | sort --version-sort | tail -1 || true)
+        if [ -z "${S3_KEY}" ]; then
+          if [ "${SKIP_IF_DOES_NOT_EXIST}" = "true" ]; then
+            echo '::set-output name=SKIPPED::true'
+            exit 0
+          else
+            echo >&2 "Neither s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME} nor its version from previous attempts exist"
+            exit 1
+          fi
+        fi
+
+        echo '::set-output name=SKIPPED::false'
+
+        mkdir -p $(dirname $ARCHIVE)
+        time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} ${ARCHIVE}
+
+    - name: Extract artifact
+      if: ${{ steps.download-artifact.outputs.SKIPPED == 'false' }}
+      shell: bash -euxo pipefail {0}
+      env:
+        TARGET: ${{ inputs.path }}
+        ARCHIVE: /tmp/downloads/${{ inputs.name }}.tar.zst
+      run: |
+        mkdir -p ${TARGET}
+        time tar -xf ${ARCHIVE} -C ${TARGET}
+        rm -f ${ARCHIVE}
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -0,0 +1,162 @@
+name: 'Run python test'
+description: 'Runs a Neon python test set, performing all the required preparations before'
+
+inputs:
+  build_type:
+    description: 'Type of Rust (neon) and C (postgres) builds. Must be "release" or "debug".'
+    required: true
+  rust_toolchain:
+    description: 'Rust toolchain version to fetch the caches'
+    required: true
+  test_selection:
+    description: 'A python test suite to run'
+    required: true
+  extra_params:
+    description: 'Arbitrary parameters to pytest. For example "-s" to prevent capturing stdout/stderr'
+    required: false
+    default: ''
+  needs_postgres_source:
+    description: 'Set to true if the test suite requires postgres source checked out'
+    required: false
+    default: 'false'
+  run_in_parallel:
+    description: 'Whether to run tests in parallel'
+    required: false
+    default: 'true'
+  save_perf_report:
+    description: 'Whether to upload the performance report'
+    required: false
+    default: 'false'
+  run_with_real_s3:
+    description: 'Whether to pass real s3 credentials to the test suite'
+    required: false
+    default: 'false'
+  real_s3_bucket:
+    description: 'Bucket name for real s3 tests'
+    required: false
+    default: ''
+  real_s3_region:
+    description: 'Region name for real s3 tests'
+    required: false
+    default: ''
+  real_s3_access_key_id:
+    description: 'Access key id'
+    required: false
+    default: ''
+  real_s3_secret_access_key:
+    description: 'Secret access key'
+    required: false
+    default: ''
+
+runs:
+  using: "composite"
+  steps:
+    - name: Get Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-artifact
+        path: /tmp/neon
+
+    - name: Checkout
+      if: inputs.needs_postgres_source == 'true'
+      uses: actions/checkout@v3
+      with:
+        submodules: true
+        fetch-depth: 1
+
+    - name: Cache poetry deps
+      id: cache_poetry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -euxo pipefail {0}
+      run: ./scripts/pysync
+
+    - name: Run pytest
+      env:
+        NEON_BIN: /tmp/neon/bin
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+        TEST_OUTPUT: /tmp/test_output
+        # this variable will be embedded in perf test report
+        # and is needed to distinguish different environments
+        PLATFORM: github-actions-selfhosted
+        BUILD_TYPE: ${{ inputs.build_type }}
+        AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }}
+        AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }}
+      shell: bash -euxo pipefail {0}
+      run: |
+        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+        rm -rf "$PERF_REPORT_DIR"
+
+        TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
+        EXTRA_PARAMS="${{ inputs.extra_params }}"
+        if [ -z "${{ inputs.test_selection }}" ]; then
+          echo "test_selection must be set"
+          exit 1
+        fi
+        if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then
+          EXTRA_PARAMS="-n4 $EXTRA_PARAMS"
+        fi
+
+        if [[ "${{ inputs.run_with_real_s3 }}" == "true" ]]; then
+          echo "REAL S3 ENABLED"
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=${{ inputs.real_s3_bucket }}
+          export REMOTE_STORAGE_S3_REGION=${{ inputs.real_s3_region }}
+        fi
+
+        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
+          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
+            mkdir -p "$PERF_REPORT_DIR"
+            EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS"
+          fi
+        fi
+
+        if [[ "${{ inputs.build_type }}" == "debug" ]]; then
+          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+        elif [[ "${{ inputs.build_type }}" == "release" ]]; then
+          cov_prefix=()
+        fi
+
+        # Run the tests.
+        #
+        # The junit.xml file allows CI tools to display more fine-grained test information
+        # in its "Tests" tab in the results page.
+        # --verbose prints name of each test (helpful when there are
+        # multiple tests in one file)
+        # -rA prints summary in the end
+        # -n4 uses four processes to run tests via pytest-xdist
+        # -s (which stops pytest from capturing output) is deliberately NOT passed: tests run
+        # in parallel, so uncaptured logs from different tests would be mixed together
+        "${cov_prefix[@]}" ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
+          --tb=short \
+          --verbose \
+          -m "not remote_cluster" \
+          -rA $TEST_SELECTION $EXTRA_PARAMS
+
+        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
+          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
+            export REPORT_FROM="$PERF_REPORT_DIR"
+            export REPORT_TO=local
+            scripts/generate_and_push_perf_report.sh
+          fi
+        fi
+
+    - name: Delete all data but logs
+      shell: bash -euxo pipefail {0}
+      if: always()
+      run: |
+        du -sh /tmp/test_output/*
+        find /tmp/test_output -type f ! -name "*.log" ! -name "regression.diffs" ! -name "junit.xml" ! -name "*.filediff" ! -name "*.stdout" ! -name "*.stderr" ! -name "flamegraph.svg" ! -name "*.metrics" -delete
+        du -sh /tmp/test_output/*
+
+    - name: Upload python test logs
+      if: always()
+      uses: ./.github/actions/upload
+      with:
+        name: python-test-${{ inputs.test_selection }}-${{ runner.os }}-${{ inputs.build_type }}-${{ inputs.rust_toolchain }}-logs
+        path: /tmp/test_output/
--- a/.github/actions/save-coverage-data/action.yml
+++ b/.github/actions/save-coverage-data/action.yml
@@ -0,0 +1,22 @@
+name: 'Merge and upload coverage data'
+description: 'Compresses and uploads the coverage data as an artifact'
+
+runs:
+  using: "composite"
+  steps:
+    - name: Merge coverage data
+      shell: bash -euxo pipefail {0}
+      run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+    - name: Download previous coverage data into the same directory
+      uses: ./.github/actions/download
+      with:
+        name: coverage-data-artifact
+        path: /tmp/coverage
+        skip-if-does-not-exist: true # skip if there's no previous coverage to download
+
+    - name: Upload coverage data
+      uses: ./.github/actions/upload
+      with:
+        name: coverage-data-artifact
+        path: /tmp/coverage
--- a/.github/actions/upload/action.yml
+++ b/.github/actions/upload/action.yml
@@ -0,0 +1,51 @@
+name: "Upload an artifact"
+description: "Custom upload action"
+inputs:
+  name:
+    description: "Artifact name"
+    required: true
+  path:
+    description: "A directory or file to upload"
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Prepare artifact
+      shell: bash -euxo pipefail {0}
+      env:
+        SOURCE: ${{ inputs.path }}
+        ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
+      run: |
+        mkdir -p $(dirname $ARCHIVE)
+        if [ -f ${ARCHIVE} ]; then
+          echo >&2 "File ${ARCHIVE} already exist. Something went wrong before"
+          exit 1
+        fi
+
+        export ZSTD_NBTHREADS=0 # exported so the zstd process spawned by tar actually sees it
+        if [ -d  ${SOURCE} ]; then
+          time tar -C ${SOURCE} -cf ${ARCHIVE} --zstd .
+        elif [ -f ${SOURCE} ]; then
+          time tar -cf ${ARCHIVE} --zstd ${SOURCE}
+        else
+          echo >&2 "${SOURCE} neither directory nor file, don't know how to handle it"
+          exit 1
+        fi
+
+    - name: Upload artifact
+      shell: bash -euxo pipefail {0}
+      env:
+        SOURCE: ${{ inputs.path }}
+        ARCHIVE: /tmp/uploads/${{ inputs.name }}.tar.zst
+      run: |
+        BUCKET=neon-github-public-dev
+        PREFIX=artifacts/${GITHUB_RUN_ID}
+        FILENAME=$(basename $ARCHIVE)
+
+        FILESIZE=$(du -sh ${ARCHIVE} | cut -f1)
+
+        time aws s3 mv --only-show-errors ${ARCHIVE} s3://${BUCKET}/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}
+
+        # Ref https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#adding-a-job-summary
+        echo "[${FILENAME}](https://${BUCKET}.s3.amazonaws.com/${PREFIX}/${GITHUB_RUN_ATTEMPT}/${FILENAME}) ${FILESIZE}" >> ${GITHUB_STEP_SUMMARY}
--- a/.circleci/ansible/.gitignore
+++ b/.circleci/ansible/.gitignore
--- a/.circleci/ansible/ansible.cfg
+++ b/.circleci/ansible/ansible.cfg
@@ -6,5 +6,7 @@ timeout = 30

 [ssh_connection]
 ssh_args   = -F ./ansible.ssh.cfg
-scp_if_ssh = True
+# teleport doesn't support sftp yet (https://github.com/gravitational/teleport/issues/7127),
+# and scp didn't work for me either
+transfer_method = piped
 pipelining = True
--- a/.circleci/ansible/ansible.ssh.cfg
+++ b/.circleci/ansible/ansible.ssh.cfg
@@ -1,3 +1,7 @@
+# Remove this once https://github.com/gravitational/teleport/issues/10918 is fixed
+# (use pre 8.5 option name to cope with old ssh in CI)
+PubkeyAcceptedKeyTypes +ssh-rsa-cert-v01@openssh.com
+
 Host tele.zenith.tech
    User admin
    Port 3023
--- a/.circleci/ansible/deploy.yaml
+++ b/.circleci/ansible/deploy.yaml
@@ -57,7 +57,7 @@
      args:
        creates: "/storage/pageserver/data/tenants"
      environment:
-        ZENITH_REPO_DIR: "/storage/pageserver/data"
+        NEON_REPO_DIR: "/storage/pageserver/data"
        LD_LIBRARY_PATH: "/usr/local/lib"
      become: true
      tags:
@@ -131,7 +131,7 @@
      args:
        creates: "/storage/safekeeper/data/safekeeper.id"
      environment:
-        ZENITH_REPO_DIR: "/storage/safekeeper/data"
+        NEON_REPO_DIR: "/storage/safekeeper/data"
        LD_LIBRARY_PATH: "/usr/local/lib"
      become: true
      tags:
--- a/.circleci/ansible/get_binaries.sh
+++ b/.circleci/ansible/get_binaries.sh
--- a/.circleci/ansible/neon-stress.hosts
+++ b/.circleci/ansible/neon-stress.hosts
@@ -12,6 +12,7 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = neon-stress
 console_mgmt_base_url = http://neon-stress-console.local
 bucket_name           = neon-storage-ireland
 bucket_region         = eu-west-1
--- a/.circleci/ansible/production.hosts
+++ b/.circleci/ansible/production.hosts
@@ -1,6 +1,7 @@
 [pageservers]
 #zenith-1-ps-1 console_region_id=1
 zenith-1-ps-2 console_region_id=1
+zenith-1-ps-3 console_region_id=1

 [safekeepers]
 zenith-1-sk-1 console_region_id=1
@@ -12,7 +13,8 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = prod-1
 console_mgmt_base_url = http://console-release.local
 bucket_name           = zenith-storage-oregon
 bucket_region         = us-west-2
-etcd_endpoints        = etcd-release.local:2379
+etcd_endpoints        = zenith-1-etcd.local:2379
--- a/.circleci/ansible/scripts/init_pageserver.sh
+++ b/.circleci/ansible/scripts/init_pageserver.sh
--- a/.circleci/ansible/scripts/init_safekeeper.sh
+++ b/.circleci/ansible/scripts/init_safekeeper.sh
@@ -12,10 +12,9 @@ cat <<EOF | tee /tmp/payload
  "version": 1,
  "host": "${HOST}",
  "port": 6500,
+  "http_port": 7676,
  "region_id": {{ console_region_id }},
-  "instance_id": "${INSTANCE_ID}",
-  "http_host": "${HOST}",
-  "http_port": 7676
+  "instance_id": "${INSTANCE_ID}"
 }
 EOF

--- a/.circleci/ansible/staging.hosts
+++ b/.circleci/ansible/staging.hosts
@@ -1,6 +1,7 @@
 [pageservers]
 #zenith-us-stage-ps-1 console_region_id=27
 zenith-us-stage-ps-2 console_region_id=27
+zenith-us-stage-ps-3 console_region_id=27

 [safekeepers]
 zenith-us-stage-sk-4 console_region_id=27
@@ -12,7 +13,8 @@ pageservers
 safekeepers

 [storage:vars]
+env_name = us-stage
 console_mgmt_base_url = http://console-staging.local
 bucket_name           = zenith-staging-storage-us-east-1
 bucket_region         = us-east-1
-etcd_endpoints        = etcd-staging.local:2379
+etcd_endpoints        = zenith-us-stage-etcd.local:2379
--- a/.circleci/ansible/systemd/pageserver.service
+++ b/.circleci/ansible/systemd/pageserver.service
@@ -5,7 +5,7 @@ After=network.target auditd.service
 [Service]
 Type=simple
 User=pageserver
-Environment=RUST_BACKTRACE=1 ZENITH_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
+Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/pageserver LD_LIBRARY_PATH=/usr/local/lib
 ExecStart=/usr/local/bin/pageserver -c "pg_distrib_dir='/usr/local'" -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" -c "broker_endpoints=['{{ etcd_endpoints }}']" -D /storage/pageserver/data
 ExecReload=/bin/kill -HUP $MAINPID
 KillMode=mixed
--- a/.github/ansible/systemd/safekeeper.service
+++ b/.github/ansible/systemd/safekeeper.service
@@ -0,0 +1,18 @@
+[Unit]
+Description=Zenith safekeeper
+After=network.target auditd.service
+
+[Service]
+Type=simple
+User=safekeeper
+Environment=RUST_BACKTRACE=1 NEON_REPO_DIR=/storage/safekeeper/data LD_LIBRARY_PATH=/usr/local/lib
+ExecStart=/usr/local/bin/safekeeper -l {{ inventory_hostname }}.local:6500 --listen-http {{ inventory_hostname }}.local:7676 -D /storage/safekeeper/data --broker-endpoints={{ etcd_endpoints }} --remote-storage='{bucket_name="{{bucket_name}}", bucket_region="{{bucket_region}}", prefix_in_bucket="{{ env_name }}/wal"}'
+ExecReload=/bin/kill -HUP $MAINPID
+KillMode=mixed
+KillSignal=SIGINT
+Restart=on-failure
+TimeoutSec=10
+LimitNOFILE=30000000
+
+[Install]
+WantedBy=multi-user.target
--- a/.circleci/helm-values/neon-stress.proxy-scram.yaml
+++ b/.circleci/helm-values/neon-stress.proxy-scram.yaml
--- a/.circleci/helm-values/neon-stress.proxy.yaml
+++ b/.circleci/helm-values/neon-stress.proxy.yaml
--- a/.circleci/helm-values/production.proxy-scram.yaml
+++ b/.circleci/helm-values/production.proxy-scram.yaml
--- a/.circleci/helm-values/production.proxy.yaml
+++ b/.circleci/helm-values/production.proxy.yaml
--- a/.circleci/helm-values/staging.proxy-scram.yaml
+++ b/.circleci/helm-values/staging.proxy-scram.yaml
--- a/.circleci/helm-values/staging.proxy.yaml
+++ b/.circleci/helm-values/staging.proxy.yaml
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -11,7 +11,7 @@ on:
    #          │ │ ┌───────────── day of the month (1 - 31)
    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:  '36 7 * * *' # run once a day, timezone is utc
+    - cron:  '36 4 * * *' # run once a day, timezone is utc

  workflow_dispatch: # adds ability to run this manually

@@ -26,11 +26,11 @@ jobs:
    runs-on: [self-hosted, zenith-benchmarker]

    env:
-      POSTGRES_DISTRIB_DIR: "/usr/pgsql-13"
+      POSTGRES_DISTRIB_DIR: "/usr/pgsql-14"

    steps:
    - name: Checkout zenith repo
-      uses: actions/checkout@v2
+      uses: actions/checkout@v3

    # actions/setup-python@v2 is not working correctly on self-hosted runners
    # see https://github.com/actions/setup-python/issues/162
@@ -60,7 +60,7 @@ jobs:
    - name: Setup cluster
      env:
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
-      shell: bash
+      shell: bash -euxo pipefail {0}
      run: |
        set -e

@@ -88,7 +88,7 @@ jobs:
        # Plus time needed to initialize the test databases.
        TEST_PG_BENCH_DURATIONS_MATRIX: "300"
        TEST_PG_BENCH_SCALES_MATRIX: "10,100"
-        PLATFORM: "zenith-staging"
+        PLATFORM: "neon-staging"
        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
        REMOTE_ENV: "1" # indicate to test harness that we do not have zenith binaries locally
      run: |
@@ -96,7 +96,7 @@ jobs:
        # since it might generate duplicates when calling ingest_perf_test_result.py
        rm -rf perf-report-staging
        mkdir -p perf-report-staging
-        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging
+        ./scripts/pytest test_runner/performance/ -v -m "remote_cluster" --skip-interfering-proc-check --out-dir perf-report-staging --timeout 3600

    - name: Submit result
      env:
@@ -104,3 +104,12 @@ jobs:
        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
      run: |
        REPORT_FROM=$(realpath perf-report-staging) REPORT_TO=staging scripts/generate_and_push_perf_report.sh
+
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Periodic perf testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -0,0 +1,620 @@
+name: Test and Deploy
+
+on:
+  push:
+    branches:
+      - main
+      - release
+  pull_request:
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+  COPT: '-Werror'
+
+jobs:
+  build-neon:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+
+    env:
+      BUILD_TYPE: ${{ matrix.build_type }}
+      GIT_VERSION: ${{ github.sha }}
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg revision for caching
+        id: pg_ver
+        run: echo ::set-output name=pg_rev::$(git rev-parse HEAD:vendor/postgres)
+
+      # Set some environment variables used by all the steps.
+      #
+      # CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
+      #   It also includes --features, if any
+      #
+      # CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
+      #   because "cargo metadata" doesn't accept --release or --debug options
+      #
+      - name: Set env variables
+        run: |
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
+            CARGO_FEATURES=""
+            CARGO_FLAGS=""
+          elif [[ $BUILD_TYPE == "release" ]]; then
+            cov_prefix=""
+            CARGO_FEATURES="--features profiling"
+            CARGO_FLAGS="--release $CARGO_FEATURES"
+          fi
+          echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV
+          echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV
+          echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV
+
+      # Don't include the ~/.cargo/registry/src directory. It contains just
+      # uncompressed versions of the crates in ~/.cargo/registry/cache
+      # directory, and it's faster to let 'cargo' to rebuild it from the
+      # compressed crates.
+      - name: Cache cargo deps
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            !~/.cargo/registry/src
+            ~/.cargo/git/
+            target/
+          # Exact key for the current Cargo.lock; restore-keys falls back to older caches when there is no exact hit
+          key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+          restore-keys: |
+            v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-
+
+      - name: Cache postgres build
+        id: cache_pg
+        uses: actions/cache@v3
+        with:
+          path: tmp_install/
+          key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_ver.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres
+        if: steps.cache_pg.outputs.cache-hit != 'true'
+        run: mold -run make postgres -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          ${cov_prefix} mold -run cargo build $CARGO_FLAGS --features failpoints --bins --tests
+
+      - name: Run cargo test
+        run: |
+          ${cov_prefix} cargo test $CARGO_FLAGS
+
+      - name: Install rust binaries
+        run: |
+          # Install target binaries
+          mkdir -p /tmp/neon/bin/
+          binaries=$(
+            ${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
+            jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
+          )
+          for bin in $binaries; do
+            SRC=target/$BUILD_TYPE/$bin
+            DST=/tmp/neon/bin/$bin
+            cp "$SRC" "$DST"
+          done
+
+          # Install test executables and write list of all binaries (for code coverage)
+          if [[ $BUILD_TYPE == "debug" ]]; then
+            # Keep bloated coverage data files away from the rest of the artifact
+            mkdir -p /tmp/coverage/
+
+            mkdir -p /tmp/neon/test_bin/
+
+            test_exe_paths=$(
+              ${cov_prefix} cargo test $CARGO_FLAGS --message-format=json --no-run |
+              jq -r '.executable | select(. != null)'
+            )
+            for bin in $test_exe_paths; do
+              SRC=$bin
+              DST=/tmp/neon/test_bin/$(basename $bin)
+
+              # We don't need debug symbols for code coverage, so strip them out to make
+              # the artifact smaller.
+              strip "$SRC" -o "$DST"
+              echo "$DST" >> /tmp/coverage/binaries.list
+            done
+
+            for bin in $binaries; do
+              echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
+            done
+          fi
+
+      - name: Install postgres binaries
+        run: cp -a tmp_install /tmp/neon/pg_install
+
+      - name: Upload Neon artifact
+        uses: ./.github/actions/upload
+        with:
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: /tmp/neon
+
+      # XXX: keep this after the binaries.list is formed, so the coverage can properly work later
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  pg_regress-tests:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest regress tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: batch_pg_regress
+          needs_postgres_source: true
+
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  other-tests:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    needs: [ build-neon ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug, release ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest other tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: batch_others
+          run_with_real_s3: true
+          real_s3_bucket: ci-tests-s3
+          real_s3_region: us-west-2
+          real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}"
+          real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
+      - name: Merge and upload coverage data
+        if: matrix.build_type == 'debug'
+        uses: ./.github/actions/save-coverage-data
+
+  benchmarks:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    needs: [ build-neon ]
+    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ release ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 2
+
+      - name: Pytest benchmarks
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: ${{ matrix.build_type }}
+          rust_toolchain: ${{ matrix.rust_toolchain }}
+          test_selection: performance
+          run_in_parallel: false
+          save_perf_report: true
+        env:
+          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+      # XXX: no coverage data handling here, since benchmarks are run on release builds,
+      # while coverage is currently collected for the debug ones
+
+  coverage-report:
+    runs-on: dev
+    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rustlegacy:2746987948
+    needs: [ other-tests, pg_regress-tests ]
+    strategy:
+      fail-fast: false
+      matrix:
+        build_type: [ debug ]
+        rust_toolchain: [ 1.58 ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Restore cargo deps cache
+        id: cache_cargo
+        uses: actions/cache@v3
+        with:
+          path: |
+            ~/.cargo/registry/
+            !~/.cargo/registry/src
+            ~/.cargo/git/
+            target/
+          key: v3-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ matrix.rust_toolchain }}-${{ hashFiles('Cargo.lock') }}
+
+      - name: Get Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ matrix.build_type }}-${{ matrix.rust_toolchain }}-artifact
+          path: /tmp/neon
+
+      - name: Get coverage artifact
+        uses: ./.github/actions/download
+        with:
+          name: coverage-data-artifact
+          path: /tmp/coverage
+
+      - name: Merge coverage data
+        run: scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage merge
+
+      - name: Build and upload coverage report
+        run: |
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+          COMMIT_URL=https://github.com/${{ github.repository }}/commit/$COMMIT_SHA
+
+          scripts/coverage \
+            --dir=/tmp/coverage report \
+            --input-objects=/tmp/coverage/binaries.list \
+            --commit-url=$COMMIT_URL \
+            --format=github
+
+          REPORT_URL=https://${{ github.repository_owner }}.github.io/zenith-coverage-data/$COMMIT_SHA
+
+          scripts/git-upload \
+            --repo=https://${{ secrets.VIP_VAP_ACCESS_TOKEN }}@github.com/${{ github.repository_owner }}/zenith-coverage-data.git \
+            --message="Add code coverage for $COMMIT_URL" \
+            copy /tmp/coverage/report $COMMIT_SHA # COPY FROM TO_RELATIVE
+
+          # Add link to the coverage report to the commit
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"success\",
+              \"context\": \"neon-coverage\",
+              \"description\": \"Coverage report is ready\",
+              \"target_url\": \"$REPORT_URL\"
+            }"
+
+  trigger-e2e-tests:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ build-neon ]
+    steps:
+      - name: Set PR's status to pending and request a remote CI test
+        run: |
+          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
+          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
+
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"
+
+          curl -f -X POST \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"state\": \"pending\",
+              \"context\": \"neon-cloud-e2e\",
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
+            }"
+
+          curl -f -X POST \
+          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
+          -H "Accept: application/vnd.github.v3+json" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
+          --data \
+            "{
+              \"ref\": \"main\",
+              \"inputs\": {
+                \"ci_job_name\": \"neon-cloud-e2e\",
+                \"commit_hash\": \"$COMMIT_SHA\",
+                \"remote_repo\": \"${{ github.repository }}\"
+              }
+            }"
+
+  docker-image:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ pg_regress-tests, other-tests ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+        with:
+          driver: docker
+
+      - name: Get build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::$(git rev-list --count HEAD)"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: build-tag
+
+      - name: Get legacy build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::latest"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: legacy-build-tag
+
+      - name: Build neon Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION="${{github.sha}}"
+            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
+            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
+          pull: true
+          push: true
+          tags: neondatabase/neon:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/neon:${{steps.build-tag.outputs.tag}}
+
+  docker-image-compute:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    needs: [ pg_regress-tests, other-tests ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    outputs:
+      build-tag: ${{steps.build-tag.outputs.tag}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+        with:
+          driver: docker
+
+      - name: Get build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::$(git rev-list --count HEAD)"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release-$(git rev-list --count HEAD)"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: build-tag
+
+      - name: Get legacy build tag
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            echo "::set-output name=tag::latest"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            echo "::set-output name=tag::release"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+        id: legacy-build-tag
+
+      - name: Build compute-tools Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION="${{github.sha}}"
+            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
+            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
+          push: false
+          file: Dockerfile.compute-tools
+          tags: neondatabase/compute-tools:local
+
+      - name: Push compute-tools Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: .
+          build-args: |
+            GIT_VERSION="${{github.sha}}"
+            AWS_ACCESS_KEY_ID="${{secrets.CACHEPOT_AWS_ACCESS_KEY_ID}}"
+            AWS_SECRET_ACCESS_KEY="${{secrets.CACHEPOT_AWS_SECRET_ACCESS_KEY}}"
+          push: true
+          file: Dockerfile.compute-tools
+          tags: neondatabase/compute-tools:${{steps.legacy-build-tag.outputs.tag}}
+
+      - name: Build compute-node Docker image
+        uses: docker/build-push-action@v2
+        with:
+          context: ./vendor/postgres/
+          build-args:
+            COMPUTE_TOOLS_TAG=local
+          push: true
+          tags: neondatabase/compute-node:${{steps.legacy-build-tag.outputs.tag}}, neondatabase/compute-node:${{steps.build-tag.outputs.tag}}
+
+  calculate-deploy-targets:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    outputs:
+      matrix-include: ${{ steps.set-matrix.outputs.include }}
+    steps:
+      - id: set-matrix
+        run: |
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            STAGING='{"env_name": "staging", "proxy_job": "neon-proxy", "proxy_config": "staging.proxy", "kubeconfig_secret": "STAGING_KUBECONFIG_DATA"}'
+            NEON_STRESS='{"env_name": "neon-stress", "proxy_job": "neon-stress-proxy", "proxy_config": "neon-stress.proxy", "kubeconfig_secret": "NEON_STRESS_KUBECONFIG_DATA"}'
+            echo "::set-output name=include::[$STAGING, $NEON_STRESS]"
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            PRODUCTION='{"env_name": "production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA"}'
+            echo "::set-output name=include::[$PRODUCTION]"
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+
+  deploy:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    # We need both storage **and** compute images for deploy, because control plane
+    # picks the compute version based on the storage version. If it notices a fresh
+    # storage it may bump the compute version. And if compute image failed to build
+    # it may break things badly.
+    needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    strategy:
+      matrix:
+        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Setup ansible
+        run: |
+          pip install --progress-bar off --user ansible boto3
+
+      - name: Redeploy
+        run: |
+          cd "$(pwd)/.github/ansible"
+
+          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
+            ./get_binaries.sh
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+            RELEASE=true ./get_binaries.sh
+          else
+            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
+            exit 1
+          fi
+
+          eval $(ssh-agent)
+          echo "${{ secrets.TELEPORT_SSH_KEY }}"  | tr -d '\n'| base64 --decode >ssh-key
+          echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
+          chmod 0600 ssh-key
+          ssh-add ssh-key
+          rm -f ssh-key ssh-key-cert.pub
+
+          ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts
+          rm -f neon_install.tar.gz .neon_current_version
+
+  deploy-proxy:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    # Compute image isn't strictly required for proxy deploy, but let's still wait for it
+    # to run all deploy jobs consistently.
+    needs: [ docker-image, docker-image-compute, calculate-deploy-targets ]
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+      github.event_name != 'workflow_dispatch'
+    strategy:
+      matrix:
+        include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}}
+    env:
+      KUBECONFIG: .kubeconfig
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+          fetch-depth: 0
+
+      - name: Store kubeconfig file
+        run: |
+          echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG}
+          chmod 0600 ${KUBECONFIG}
+
+      - name: Setup helm v3
+        run: |
+          curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
+          helm repo add neondatabase https://neondatabase.github.io/helm-charts
+
+      - name: Re-deploy proxy
+        run: |
+          DOCKER_TAG=${{needs.docker-image.outputs.build-tag}}
+          helm upgrade ${{ matrix.proxy_job }}       neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
+          helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace default --install -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s
--- a/.github/workflows/codestyle.yml
+++ b/.github/workflows/codestyle.yml
@@ -1,4 +1,4 @@
-name: Build and Test
+name: Check code style and build

 on:
  push:
@@ -6,15 +6,28 @@ on:
    - main
  pull_request:

+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow run at a time per non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+env:
+  RUST_BACKTRACE: 1
+
 jobs:
-  regression-check:
+  check-codestyle-rust:
    strategy:
+      fail-fast: false
      matrix:
        # If we want to duplicate this job for different
        # Rust toolchains (e.g. nightly or 1.37.0), add them here.
-        rust_toolchain: [stable]
+        rust_toolchain: [1.58]
        os: [ubuntu-latest, macos-latest]
-    timeout-minutes: 30
+    timeout-minutes: 60
    name: run regression test suite
    runs-on: ${{ matrix.os }}

@@ -85,12 +98,38 @@ jobs:
        with:
          path: |
            ~/.cargo/registry
+            !~/.cargo/registry/src
            ~/.cargo/git
            target
-          key: ${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}
+          key: v1-${{ runner.os }}-cargo-${{ hashFiles('./Cargo.lock') }}-rust-${{ matrix.rust_toolchain }}

      - name: Run cargo clippy
        run: ./run_clippy.sh

-      - name: Run cargo test
-        run: cargo test --all --all-targets
+      - name: Ensure all project builds
+        run: cargo build --all --all-targets
+
+  check-codestyle-python:
+    runs-on: [ self-hosted, Linux, k8s-runner ]
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: false
+          fetch-depth: 1
+
+      - name: Cache poetry deps
+        id: cache_poetry
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pypoetry/virtualenvs
+          key: v1-codestyle-python-deps-${{ hashFiles('poetry.lock') }}
+
+      - name: Install Python deps
+        run: ./scripts/pysync
+
+      - name: Run yapf to ensure code format
+        run: poetry run yapf --recursive --diff .
+
+      - name: Run mypy to check types
+        run: poetry run mypy .
--- a/.github/workflows/pg_clients.yml
+++ b/.github/workflows/pg_clients.yml
@@ -0,0 +1,72 @@
+name: Test Postgres client libraries
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '23 02 * * *' # run once a day, timezone is utc
+
+  workflow_dispatch:
+
+concurrency:
+  # Allow only one workflow per any non-`main` branch.
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.ref == 'refs/heads/main' && github.sha || 'anysha' }}
+  cancel-in-progress: true
+
+jobs:
+  test-postgres-client-libs:
+    runs-on: [ ubuntu-latest ]
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+
+    - uses: actions/setup-python@v4
+      with:
+        python-version: 3.9
+
+    - name: Install Poetry
+      uses: snok/install-poetry@v1
+
+    - name: Cache poetry deps
+      id: cache_poetry
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pypoetry/virtualenvs
+        key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }}
+
+    - name: Install Python deps
+      shell: bash -euxo pipefail {0}
+      run: ./scripts/pysync
+
+    - name: Run pytest
+      env:
+        REMOTE_ENV: 1
+        BENCHMARK_CONNSTR: "${{ secrets.BENCHMARK_STAGING_CONNSTR }}"
+        TEST_OUTPUT: /tmp/test_output
+        POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      shell: bash -euxo pipefail {0}
+      run: |
+        # Test framework expects we have psql binary;
+        # but since we don't really need it in this test, let's mock it
+        mkdir -p "$POSTGRES_DISTRIB_DIR/bin" && touch "$POSTGRES_DISTRIB_DIR/bin/psql";
+        ./scripts/pytest \
+          --junitxml=$TEST_OUTPUT/junit.xml \
+          --tb=short \
+          --verbose \
+          -m "remote_cluster" \
+          -rA "test_runner/pg_clients"
+
+    - name: Post to a Slack channel
+      if: failure()
+      id: slack
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C033QLM5P7D" # dev-staging-stream
+        slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.gitignore
+++ b/.gitignore
@@ -5,8 +5,9 @@
 __pycache__/
 test_output/
 .vscode
-/.zenith
-/integration_tests/.zenith
+.idea
+/.neon
+/integration_tests/.neon

 # Coverage
 *.profraw
--- a/.yapfignore
+++ b/.yapfignore
@@ -6,5 +6,5 @@ target/
 tmp_install/
 __pycache__/
 test_output/
-.zenith/
+.neon/
 .git/
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -11,17 +11,15 @@ than it was before.

 ## Submitting changes

-1. Make a PR for every change.
-
-   Even seemingly trivial patches can break things in surprising ways.
-Use of common sense is OK. If you're only fixing a typo in a comment,
-it's probably fine to just push it. But if in doubt, open a PR.
-
-2. Get at least one +1 on your PR before you push.
+1. Get at least one +1 on your PR before you push.

   For simple patches, it will only take a minute for someone to review
 it.

+2. Don't force push small changes after making the PR ready for review.
+Doing so will force readers to re-read your entire PR, which will delay
+the review process.
+
 3. Always keep the CI green.

   Do not push, if the CI failed on your PR. Even if you think it's not
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/14
+++ b/14
@@ -1,5 +1,5 @@
 # Build Postgres
-FROM zimg/rust:1.58 AS pg-build
+FROM neondatabase/rust:1.58 AS pg-build
 WORKDIR /pg

 USER root
@@ -14,9 +14,13 @@ RUN set -e \
    && tar -C tmp_install -czf /postgres_install.tar.gz .

 # Build zenith binaries
-FROM zimg/rust:1.58 AS build
+FROM neondatabase/rust:1.58 AS build
 ARG GIT_VERSION=local

+# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
+# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
+# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
+ARG RUSTC_WRAPPER=cachepot
 ARG CACHEPOT_BUCKET=zenith-rust-cachepot
 ARG AWS_ACCESS_KEY_ID
 ARG AWS_SECRET_ACCESS_KEY
@@ -46,9 +50,9 @@ RUN set -e \
    && useradd -d /data zenith \
    && chown -R zenith:zenith /data

-COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/pageserver /usr/local/bin
-COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/safekeeper /usr/local/bin
-COPY --from=build --chown=zenith:zenith /home/circleci/project/target/release/proxy      /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/pageserver /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/safekeeper /usr/local/bin
+COPY --from=build --chown=zenith:zenith /home/runner/target/release/proxy      /usr/local/bin

 COPY --from=pg-build /pg/tmp_install/         /usr/local/
 COPY --from=pg-build /postgres_install.tar.gz /data/
--- a/Dockerfile.compute-tools
+++ b/Dockerfile.compute-tools
@@ -1,7 +1,11 @@
 # First transient image to build compute_tools binaries
-# NB: keep in sync with rust image version in .circle/config.yml
-FROM zimg/rust:1.58 AS rust-build
+# NB: keep in sync with rust image version in .github/workflows/build_and_test.yml
+FROM neondatabase/rust:1.58 AS rust-build

+# Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
+# Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
+# cachepot falls back to local filesystem if S3 is misconfigured, not failing the build.
+ARG RUSTC_WRAPPER=cachepot
 ARG CACHEPOT_BUCKET=zenith-rust-cachepot
 ARG AWS_ACCESS_KEY_ID
 ARG AWS_SECRET_ACCESS_KEY
@@ -15,4 +19,4 @@ RUN set -e \
 # Final image that only has one binary
 FROM debian:buster-slim

-COPY --from=rust-build /home/circleci/project/target/release/compute_ctl /usr/local/bin/compute_ctl
+COPY --from=rust-build /home/runner/target/release/compute_ctl /usr/local/bin/compute_ctl
--- a/43
+++ b/43
@@ -1,3 +1,8 @@
+ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+
+# Where to install Postgres. Defaults to ./tmp_install; overriding it may be useful for package managers.
+POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/tmp_install
+
 # Seccomp BPF is only available for Linux
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Linux)
@@ -24,9 +29,11 @@ else
 endif

 # macOS with brew-installed openssl requires explicit paths
+# It can be configured with OPENSSL_PREFIX variable
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Darwin)
-    PG_CONFIGURE_OPTS += --with-includes=$(HOMEBREW_PREFIX)/opt/openssl/include --with-libraries=$(HOMEBREW_PREFIX)/opt/openssl/lib
+    OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+    PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
 endif

 # Choose whether we should be silent or verbose
@@ -55,55 +62,55 @@ zenith: postgres-headers
 	$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)

 ### PostgreSQL parts
-tmp_install/build/config.status:
+$(POSTGRES_INSTALL_DIR)/build/config.status:
 	+@echo "Configuring postgres build"
-	mkdir -p tmp_install/build
-	(cd tmp_install/build && \
-	../../vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
+	mkdir -p $(POSTGRES_INSTALL_DIR)/build
+	(cd $(POSTGRES_INSTALL_DIR)/build && \
+	$(ROOT_PROJECT_DIR)/vendor/postgres/configure CFLAGS='$(PG_CFLAGS)' \
 		$(PG_CONFIGURE_OPTS) \
 		$(SECCOMP) \
-		--prefix=$(abspath tmp_install) > configure.log)
+		--prefix=$(abspath $(POSTGRES_INSTALL_DIR)) > configure.log)

 # nicer alias for running 'configure'
 .PHONY: postgres-configure
-postgres-configure: tmp_install/build/config.status
+postgres-configure: $(POSTGRES_INSTALL_DIR)/build/config.status

-# Install the PostgreSQL header files into tmp_install/include
+# Install the PostgreSQL header files into $(POSTGRES_INSTALL_DIR)/include
 .PHONY: postgres-headers
 postgres-headers: postgres-configure
 	+@echo "Installing PostgreSQL headers"
-	$(MAKE) -C tmp_install/build/src/include MAKELEVEL=0 install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/src/include MAKELEVEL=0 install

 # Compile and install PostgreSQL and contrib/neon
 .PHONY: postgres
 postgres: postgres-configure \
 		  postgres-headers # to prevent `make install` conflicts with zenith's `postgres-headers`
 	+@echo "Compiling PostgreSQL"
-	$(MAKE) -C tmp_install/build MAKELEVEL=0 install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 install
 	+@echo "Compiling contrib/neon"
-	$(MAKE) -C tmp_install/build/contrib/neon install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon install
 	+@echo "Compiling contrib/neon_test_utils"
-	$(MAKE) -C tmp_install/build/contrib/neon_test_utils install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/neon_test_utils install
 	+@echo "Compiling pg_buffercache"
-	$(MAKE) -C tmp_install/build/contrib/pg_buffercache install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pg_buffercache install
 	+@echo "Compiling pageinspect"
-	$(MAKE) -C tmp_install/build/contrib/pageinspect install
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/contrib/pageinspect install


 .PHONY: postgres-clean
 postgres-clean:
-	$(MAKE) -C tmp_install/build MAKELEVEL=0 clean
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build MAKELEVEL=0 clean

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
 clean:
-	cd tmp_install/build && $(MAKE) clean
+	cd $(POSTGRES_INSTALL_DIR)/build && $(MAKE) clean
 	$(CARGO_CMD_PREFIX) cargo clean

 # This removes everything
 .PHONY: distclean
 distclean:
-	rm -rf tmp_install
+	rm -rf $(POSTGRES_INSTALL_DIR)
 	$(CARGO_CMD_PREFIX) cargo clean

 .PHONY: fmt
@@ -112,4 +119,4 @@ fmt:

 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
-	ln -s -f ../../pre-commit.py .git/hooks/pre-commit
+	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Neon

-Neon is a serverless open source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes PostgreSQL storage layer by redistributing data across a cluster of nodes.
+Neon is a serverless open-source alternative to AWS Aurora Postgres. It separates storage and compute and substitutes the PostgreSQL storage layer by redistributing data across a cluster of nodes.

 The project used to be called "Zenith". Many of the commands and code comments
 still refer to "zenith", but we are in the process of renaming things.
@@ -12,32 +12,31 @@ Alternatively, compile and run the project [locally](#running-local-installation

 ## Architecture overview

-A Neon installation consists of compute nodes and Neon storage engine.
+A Neon installation consists of compute nodes and a Neon storage engine.

-Compute nodes are stateless PostgreSQL nodes, backed by Neon storage engine.
+Compute nodes are stateless PostgreSQL nodes backed by the Neon storage engine.

-Neon storage engine consists of two major components:
- Pageserver. Scalable storage backend for compute nodes.
- WAL service. The service that receives WAL from compute node and ensures that it is stored durably.
+The Neon storage engine consists of two major components:
+- Pageserver. Scalable storage backend for the compute nodes.
+- WAL service. The service receives WAL from the compute node and ensures that it is stored durably.

 Pageserver consists of:
 - Repository - Neon storage implementation.
 - WAL receiver - service that receives WAL from WAL service and stores it in the repository.
 - Page service - service that communicates with compute nodes and responds with pages from the repository.
- WAL redo - service that builds pages from base images and WAL records on Page service request.
-
+- WAL redo - service that builds pages from base images and WAL records on Page service request.
 ## Running local installation


-#### building on Linux
-1. Install build dependencies and other useful packages
+#### Installing dependencies on Linux
+1. Install build dependencies and other applicable packages

-* On Ubuntu or Debian this set of packages should be sufficient to build the code:
+* On Ubuntu or Debian, this set of packages should be sufficient to build the code:
 ```bash
 apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \
 libssl-dev clang pkg-config libpq-dev etcd cmake postgresql-client
 ```
-* On Fedora these packages are needed:
+* On Fedora, these packages are needed:
 ```bash
 dnf install flex bison readline-devel zlib-devel openssl-devel \
  libseccomp-devel perl clang cmake etcd postgresql postgresql-contrib
@@ -49,18 +48,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```

-3. Build neon and patched postgres
-```sh
-git clone --recursive https://github.com/neondatabase/neon.git
-cd neon
-make -j`nproc`
-```
-
-#### building on OSX (12.3.1)
+#### Installing dependencies on OSX (12.3.1)
 1. Install XCode and dependencies
 ```
 xcode-select --install
-brew install protobuf etcd
+brew install protobuf etcd openssl
 ```

 2. [Install Rust](https://www.rust-lang.org/tools/install)
@@ -76,24 +68,49 @@ brew install libpq
 brew link --force libpq
 ```

-4. Build neon and patched postgres
-```sh
+#### Building on Linux
+
+1. Build neon and patched postgres
+```
+# Note: The path to the neon sources cannot contain spaces.
+
 git clone --recursive https://github.com/neondatabase/neon.git
 cd neon
-make -j5
+
+# The default and preferred build type is debug. A debug build produces
+# noticeably slower binaries than a release build. If you want a release
+# build instead, use "BUILD_TYPE=release make -j`nproc`"
+
+make -j`nproc`
 ```

-#### dependency installation notes
+#### Building on OSX
+
+1. Build neon and patched postgres
+```
+# Note: The path to the neon sources cannot contain spaces.
+
+git clone --recursive https://github.com/neondatabase/neon.git
+cd neon
+
+# The default and preferred build type is debug. A debug build produces
+# noticeably slower binaries than a release build. If you want a release
+# build instead, use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
+
+make -j`sysctl -n hw.logicalcpu`
+```
+
+#### Dependency installation notes
 To run the `psql` client, install the `postgresql-client` package or modify `PATH` and `LD_LIBRARY_PATH` to include `tmp_install/bin` and `tmp_install/lib`, respectively.

 To run the integration tests or Python scripts (not required to use the code), install
-Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires poetry) in the project directory.
+Python (3.9 or higher), and install python3 packages using `./scripts/pysync` (requires [poetry](https://python-poetry.org/)) in the project directory.


-#### running neon database
+#### Running neon database
 1. Start pageserver and postgres on top of it (should be called from repo root):
 ```sh
-# Create repository in .zenith with proper paths to binaries and data
+# Create repository in .neon with proper paths to binaries and data
 # Later that would be responsibility of a package install script
 > ./target/debug/neon_local init
 initializing tenantid 9ef87a5bf0d92544f6fafeeb3239695c
@@ -103,16 +120,16 @@ pageserver init succeeded

 # start pageserver and safekeeper
 > ./target/debug/neon_local start
-Starting pageserver at '127.0.0.1:64000' in '.zenith'
+Starting pageserver at '127.0.0.1:64000' in '.neon'
 Pageserver started
 initializing for sk 1 for 7676
-Starting safekeeper at '127.0.0.1:5454' in '.zenith/safekeepers/sk1'
+Starting safekeeper at '127.0.0.1:5454' in '.neon/safekeepers/sk1'
 Safekeeper started

 # start postgres compute node
 > ./target/debug/neon_local pg start main
 Starting new postgres main on timeline de200bd42b49cc1814412c7e592dd6e9 ...
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/main port=55432
 Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres'

 # check list of running postgres instances
@@ -121,7 +138,7 @@ Starting postgres node at 'host=127.0.0.1 port=55432 user=cloud_admin dbname=pos
 main  127.0.0.1:55432  de200bd42b49cc1814412c7e592dd6e9  main         0/16B5BA8  running
 ```

-2. Now it is possible to connect to postgres and run some queries:
+2. Now, it is possible to connect to postgres and run some queries:
 ```text
 > psql -p55432 -h 127.0.0.1 -U cloud_admin postgres
 postgres=# CREATE TABLE t(key int primary key, value text);
@@ -149,7 +166,7 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 # start postgres on that branch
 > ./target/debug/neon_local pg start migration_check --branch-name migration_check
 Starting new postgres migration_check on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
-Extracting base backup to create postgres instance: path=.zenith/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
+Extracting base backup to create postgres instance: path=.neon/pgdatadirs/tenants/9ef87a5bf0d92544f6fafeeb3239695c/migration_check port=55433
 Starting postgres node at 'host=127.0.0.1 port=55433 user=cloud_admin dbname=postgres'

 # check the new list of running postgres instances
@@ -179,14 +196,16 @@ postgres=# select * from t;
 (1 row)
 ```

-4. If you want to run tests afterwards (see below), you have to stop all the running the pageserver, safekeeper and postgres instances
-   you have just started. You can stop them all with one command:
+4. If you want to run tests afterward (see below), you must stop all the running pageserver, safekeeper, and postgres instances
+   you have just started. You can terminate them all with one command:
 ```sh
 > ./target/debug/neon_local stop
 ```

 ## Running tests

+Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
+
 ```sh
 git clone --recursive https://github.com/neondatabase/neon.git
 make # builds also postgres and installs it to ./tmp_install
@@ -203,13 +222,13 @@ To view your `rustdoc` documentation in a browser, try running `cargo doc --no-d

 ### Postgres-specific terms

-Due to Neon's very close relation with PostgreSQL internals, there are numerous specific terms used.
-Same applies to certain spelling: i.e. we use MB to denote 1024 * 1024 bytes, while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.
+Due to Neon's very close relation with PostgreSQL internals, numerous specific terms are used.
+The same applies to certain spelling: e.g., we use MB to denote 1024 * 1024 bytes; while MiB would be technically more correct, it's inconsistent with what PostgreSQL code and its documentation use.

 To get more familiar with this aspect, refer to:

 - [Neon glossary](/docs/glossary.md)
- [PostgreSQL glossary](https://www.postgresql.org/docs/13/glossary.html)
+- [PostgreSQL glossary](https://www.postgresql.org/docs/14/glossary.html)
 - Other PostgreSQL documentation and sources (Neon fork sources can be found [here](https://github.com/neondatabase/postgres))

 ## Join the development
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -4,7 +4,6 @@ version = "0.1.0"
 edition = "2021"

 [dependencies]
-libc = "0.2"
 anyhow = "1.0"
 chrono = "0.4"
 clap = "3.0"
@@ -18,4 +17,5 @@ serde_json = "1"
 tar = "0.4"
 tokio = { version = "1.17", features = ["macros", "rt", "rt-multi-thread"] }
 tokio-postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+url = "2.2.2"
 workspace_hack = { version = "0.1", path = "../workspace_hack" }
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -33,7 +33,7 @@ use std::process::exit;
 use std::sync::{Arc, RwLock};
 use std::{thread, time::Duration};

-use anyhow::Result;
+use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
 use log::{error, info};
@@ -45,6 +45,7 @@ use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::pg_helpers::*;
 use compute_tools::spec::*;
+use url::Url;

 fn main() -> Result<()> {
    // TODO: re-use `utils::logging` later
@@ -131,7 +132,7 @@ fn main() -> Result<()> {

    let compute_state = ComputeNode {
        start_time: Utc::now(),
-        connstr: connstr.to_string(),
+        connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?,
        pgdata: pgdata.to_string(),
        pgbin: pgbin.to_string(),
        spec,
@@ -156,7 +157,7 @@ fn main() -> Result<()> {
            exit(code)
        }
        Err(error) => {
-            error!("could not start the compute node: {}", error);
+            error!("could not start the compute node: {:?}", error);

            let mut state = compute.state.write().unwrap();
            state.error = Some(format!("{:?}", error));
--- a/compute_tools/src/checker.rs
+++ b/compute_tools/src/checker.rs
@@ -1,5 +1,3 @@
-use std::sync::Arc;
-
 use anyhow::{anyhow, Result};
 use log::error;
 use postgres::Client;
@@ -23,9 +21,8 @@ pub fn create_writablity_check_data(client: &mut Client) -> Result<()> {
    Ok(())
 }

-pub async fn check_writability(compute: &Arc<ComputeNode>) -> Result<()> {
-    let connstr = &compute.connstr;
-    let (client, connection) = tokio_postgres::connect(connstr, NoTls).await?;
+pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
+    let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?;
    if client.is_closed() {
        return Err(anyhow!("connection to postgres closed"));
    }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -35,7 +35,8 @@ use crate::spec::*;
 /// Compute node info shared across several `compute_ctl` threads.
 pub struct ComputeNode {
    pub start_time: DateTime<Utc>,
-    pub connstr: String,
+    // Url type maintains proper escaping
+    pub connstr: url::Url,
    pub pgdata: String,
    pub pgbin: String,
    pub spec: ComputeSpec,
@@ -268,28 +269,33 @@ impl ComputeNode {
        // In this case we need to connect with old `zenith_admin`name
        // and create new user. We cannot simply rename connected user,
        // but we can create a new one and grant it all privileges.
-        let mut client = match Client::connect(&self.connstr, NoTls) {
+        let mut client = match Client::connect(self.connstr.as_str(), NoTls) {
            Err(e) => {
                info!(
                    "cannot connect to postgres: {}, retrying with `zenith_admin` username",
                    e
                );
-                let zenith_admin_connstr = self.connstr.replacen("cloud_admin", "zenith_admin", 1);
+                let mut zenith_admin_connstr = self.connstr.clone();

-                let mut client = Client::connect(&zenith_admin_connstr, NoTls)?;
+                zenith_admin_connstr
+                    .set_username("zenith_admin")
+                    .map_err(|_| anyhow::anyhow!("invalid connstr"))?;
+
+                let mut client = Client::connect(zenith_admin_connstr.as_str(), NoTls)?;
                client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                drop(client);

                // reconnect with connsting with expected name
-                Client::connect(&self.connstr, NoTls)?
+                Client::connect(self.connstr.as_str(), NoTls)?
            }
            Ok(client) => client,
        };

        handle_roles(&self.spec, &mut client)?;
        handle_databases(&self.spec, &mut client)?;
-        handle_grants(&self.spec, &mut client)?;
+        handle_role_deletions(self, &mut client)?;
+        handle_grants(self, &mut client)?;
        create_writablity_check_data(&mut client)?;

        // 'Close' connection
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -13,11 +13,11 @@ const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
 // XXX: the only expected panic is at `RwLock` unwrap().
-fn watch_compute_activity(compute: &Arc<ComputeNode>) {
+fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
-    let connstr = compute.connstr.clone();
+    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
-    let mut client = Client::connect(&connstr, NoTls);
+    let mut client = Client::connect(connstr, NoTls);
    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);

    info!("watching Postgres activity at {}", connstr);
@@ -32,7 +32,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
                    info!("connection to postgres closed, trying to reconnect");

                    // Connection is closed, reconnect and try again.
-                    client = Client::connect(&connstr, NoTls);
+                    client = Client::connect(connstr, NoTls);
                    continue;
                }

@@ -93,7 +93,7 @@ fn watch_compute_activity(compute: &Arc<ComputeNode>) {
                debug!("cannot connect to postgres: {}, retrying", e);

                // Establish a new connection and try again.
-                client = Client::connect(&connstr, NoTls);
+                client = Client::connect(connstr, NoTls);
            }
        }
    }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -1,3 +1,4 @@
+use std::fmt::Write;
 use std::fs::File;
 use std::io::{BufRead, BufReader};
 use std::net::{SocketAddr, TcpStream};
@@ -138,9 +139,11 @@ impl Role {
            // Now we also support SCRAM-SHA-256 and to preserve compatibility
            // we treat all encrypted_password as md5 unless they starts with SCRAM-SHA-256.
            if pass.starts_with("SCRAM-SHA-256") {
-                params.push_str(&format!(" PASSWORD '{}'", pass));
+                write!(params, " PASSWORD '{pass}'")
+                    .expect("String is documented to not to error during write operations");
            } else {
-                params.push_str(&format!(" PASSWORD 'md5{}'", pass));
+                write!(params, " PASSWORD 'md5{pass}'")
+                    .expect("String is documented to not to error during write operations");
            }
        } else {
            params.push_str(" PASSWORD NULL");
@@ -158,7 +161,8 @@ impl Database {
    /// it may require a proper quoting too.
    pub fn to_pg_options(&self) -> String {
        let mut params: String = self.options.as_pg_options();
-        params.push_str(&format!(" OWNER {}", &self.owner.quote()));
+        write!(params, " OWNER {}", &self.owner.quote())
+            .expect("String is documented to not to error during write operations");

        params
    }
@@ -244,18 +248,20 @@ pub fn wait_for_postgres(pg: &mut Child, port: &str, pgdata: &Path) -> Result<()
            bail!("Postgres exited unexpectedly with code {}", code);
        }

-        if pid_path.exists() {
-            let file = BufReader::new(File::open(&pid_path)?);
-            let status = file
-                .lines()
-                .last()
-                .unwrap()
-                .unwrap_or_else(|_| "unknown".to_string());
-            let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
+        // Check that we can open pid file first.
+        if let Ok(file) = File::open(&pid_path) {
+            let file = BufReader::new(file);
+            let last_line = file.lines().last();

-            // Now Postgres is ready to accept connections
-            if status.trim() == "ready" && can_connect {
-                break;
+            // Pid file could be there and we could read it, but it could be empty, for example.
+            if let Some(Ok(line)) = last_line {
+                let status = line.trim();
+                let can_connect = TcpStream::connect_timeout(&addr, timeout).is_ok();
+
+                // Now Postgres is ready to accept connections
+                if status == "ready" && can_connect {
+                    break;
+                }
            }
        }

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -2,9 +2,10 @@ use std::path::Path;

 use anyhow::Result;
 use log::{info, log_enabled, warn, Level};
-use postgres::Client;
+use postgres::{Client, NoTls};
 use serde::Deserialize;

+use crate::compute::ComputeNode;
 use crate::config;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;
@@ -97,18 +98,13 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

    // Process delta operations first
    if let Some(ops) = &spec.delta_operations {
-        info!("processing delta operations on roles");
+        info!("processing role renames");
        for op in ops {
            match op.action.as_ref() {
-                // We do not check either role exists or not,
-                // Postgres will take care of it for us
                "delete_role" => {
-                    let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
-
-                    warn!("deleting role '{}'", &op.name);
-                    xact.execute(query.as_str(), &[])?;
+                    // no-op now, roles will be deleted at the end of configuration
                }
-                // Renaming role drops its password, since tole name is
+                // Renaming role drops its password, since role name is
                // used as a salt there.  It is important that this role
                // is recorded with a new `name` in the `roles` list.
                // Follow up roles update will set the new password.
@@ -182,7 +178,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            xact.execute(query.as_str(), &[])?;

            let grant_query = format!(
-                "grant pg_read_all_data, pg_write_all_data to {}",
+                "GRANT pg_read_all_data, pg_write_all_data TO {}",
                name.quote()
            );
            xact.execute(grant_query.as_str(), &[])?;
@@ -197,6 +193,70 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    Ok(())
 }

+/// Reassign all dependent objects and delete requested roles.
+pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> {
+    let spec = &node.spec;
+
+    // First, reassign all dependent objects to db owners.
+    if let Some(ops) = &spec.delta_operations {
+        info!("reassigning dependent objects of to-be-deleted roles");
+        for op in ops {
+            if op.action == "delete_role" {
+                reassign_owned_objects(node, &op.name)?;
+            }
+        }
+    }
+
+    // Second, proceed with role deletions.
+    let mut xact = client.transaction()?;
+    if let Some(ops) = &spec.delta_operations {
+        info!("processing role deletions");
+        for op in ops {
+            // We do not check either role exists or not,
+            // Postgres will take care of it for us
+            if op.action == "delete_role" {
+                let query: String = format!("DROP ROLE IF EXISTS {}", &op.name.quote());
+
+                warn!("deleting role '{}'", &op.name);
+                xact.execute(query.as_str(), &[])?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+// Reassign all owned objects in all databases to the owner of the database.
+fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> {
+    for db in &node.spec.cluster.databases {
+        if db.owner != *role_name {
+            let mut connstr = node.connstr.clone();
+            // database name is always the last and the only component of the path
+            connstr.set_path(&db.name);
+
+            let mut client = Client::connect(connstr.as_str(), NoTls)?;
+
+            // This will reassign all dependent objects to the db owner
+            let reassign_query = format!(
+                "REASSIGN OWNED BY {} TO {}",
+                role_name.quote(),
+                db.owner.quote()
+            );
+            info!(
+                "reassigning objects owned by '{}' in db '{}' to '{}'",
+                role_name, &db.name, &db.owner
+            );
+            client.simple_query(&reassign_query)?;
+
+            // This now will only drop privileges of the role
+            let drop_query = format!("DROP OWNED BY {}", role_name.quote());
+            client.simple_query(&drop_query)?;
+        }
+    }
+
+    Ok(())
+}
+
 /// It follows mostly the same logic as `handle_roles()` excepting that we
 /// does not use an explicit transactions block, since major database operations
 /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level
@@ -289,23 +349,80 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    Ok(())
 }

-// Grant CREATE ON DATABASE to the database owner
-// to allow clients create trusted extensions.
-pub fn handle_grants(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
+/// Grant CREATE ON DATABASE to every role listed in the spec and make some other alterations and
+/// grants, for example to let users create trusted extensions and re-create the `public` schema.
+pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> {
+    let spec = &node.spec;
+
    info!("cluster spec grants:");

+    // We now have a separate `web_access` role to connect to the database
+    // via the web interface and proxy link auth. And also we grant a
+    // read / write all data privilege to every role. So also grant
+    // create to everyone.
+    // XXX: later we should stop messing with Postgres ACL in such horrible
+    // ways.
+    let roles = spec
+        .cluster
+        .roles
+        .iter()
+        .map(|r| r.name.quote())
+        .collect::<Vec<_>>();
+
    for db in &spec.cluster.databases {
        let dbname = &db.name;

        let query: String = format!(
            "GRANT CREATE ON DATABASE {} TO {}",
            dbname.quote(),
-            db.owner.quote()
+            roles.join(", ")
        );
        info!("grant query {}", &query);

        client.execute(query.as_str(), &[])?;
    }

+    // Do some per-database access adjustments. We'd better do this at db creation time,
+    // but CREATE DATABASE isn't transactional. So we cannot create db + do some grants
+    // atomically.
+    let mut db_connstr = node.connstr.clone();
+    for db in &node.spec.cluster.databases {
+        // database name is always the last and the only component of the path
+        db_connstr.set_path(&db.name);
+
+        let mut db_client = Client::connect(db_connstr.as_str(), NoTls)?;
+
+        // This will only change ownership on the schema itself, not the objects
+        // inside it. Without it owner of the `public` schema will be `cloud_admin`
+        // and database owner cannot do anything with it. SQL procedure ensures
+        // that it won't error out if schema `public` doesn't exist.
+        let alter_query = format!(
+            "DO $$\n\
+                DECLARE\n\
+                    schema_owner TEXT;\n\
+                BEGIN\n\
+                    IF EXISTS(\n\
+                        SELECT nspname\n\
+                        FROM pg_catalog.pg_namespace\n\
+                        WHERE nspname = 'public'\n\
+                    )\n\
+                    THEN\n\
+                        SELECT nspowner::regrole::text\n\
+                            FROM pg_catalog.pg_namespace\n\
+                            WHERE nspname = 'public'\n\
+                            INTO schema_owner;\n\
+                \n\
+                        IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'\n\
+                        THEN\n\
+                            ALTER SCHEMA public OWNER TO {};\n\
+                        END IF;\n\
+                    END IF;\n\
+                END\n\
+            $$;",
+            db.owner.quote()
+        );
+        db_client.simple_query(&alter_query)?;
+    }
+
    Ok(())
 }
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -9,12 +9,11 @@ postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8
 serde = { version = "1.0", features = ["derive"] }
 serde_with = "1.12.0"
 toml = "0.5"
-lazy_static = "1.4"
+once_cell = "1.13.0"
 regex = "1"
 anyhow = "1.0"
 thiserror = "1"
 nix = "0.23"
-url = "2.2.2"
 reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

 pageserver = { path = "../pageserver" }
--- a/control_plane/src/etcd.rs
+++ b/control_plane/src/etcd.rs
@@ -30,14 +30,14 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    let etcd_stdout_file =
        fs::File::create(etcd_data_dir.join("etcd.stdout.log")).with_context(|| {
            format!(
-                "Failed to create ectd stout file in directory {}",
+                "Failed to create etcd stout file in directory {}",
                etcd_data_dir.display()
            )
        })?;
    let etcd_stderr_file =
        fs::File::create(etcd_data_dir.join("etcd.stderr.log")).with_context(|| {
            format!(
-                "Failed to create ectd stderr file in directory {}",
+                "Failed to create etcd stderr file in directory {}",
                etcd_data_dir.display()
            )
        })?;
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -51,7 +51,11 @@ fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
 }

 fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
-    for env_key in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"] {
+    for env_key in [
+        "AWS_ACCESS_KEY_ID",
+        "AWS_SECRET_ACCESS_KEY",
+        "AWS_SESSION_TOKEN",
+    ] {
        if let Ok(value) = std::env::var(env_key) {
            cmd = cmd.env(env_key, value);
        }
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -21,9 +21,9 @@ use utils::{
 use crate::safekeeper::SafekeeperNode;

 //
-// This data structures represents zenith CLI config
+// This data structure represents the neon_local CLI config
 //
-// It is deserialized from the .zenith/config file, or the config file passed
+// It is deserialized from the .neon/config file, or the config file passed
 // to 'zenith init --config=<path>' option. See control_plane/simple.conf for
 // an example.
 //
@@ -34,8 +34,8 @@ pub struct LocalEnv {
    // compute nodes).
    //
    // This is not stored in the config file. Rather, this is the path where the
-    // config file itself is. It is read from the ZENITH_REPO_DIR env variable or
-    // '.zenith' if not given.
+    // config file itself is. It is read from the NEON_REPO_DIR env variable or
+    // '.neon' if not given.
    #[serde(skip)]
    pub base_data_dir: PathBuf,

@@ -177,6 +177,7 @@ pub struct SafekeeperConf {
    pub sync: bool,
    pub remote_storage: Option<String>,
    pub backup_threads: Option<u32>,
+    pub auth_enabled: bool,
 }

 impl Default for SafekeeperConf {
@@ -188,6 +189,7 @@ impl Default for SafekeeperConf {
            sync: true,
            remote_storage: None,
            backup_threads: None,
+            auth_enabled: false,
        }
    }
 }
@@ -337,7 +339,7 @@ impl LocalEnv {
    pub fn persist_config(&self, base_path: &Path) -> anyhow::Result<()> {
        // Currently, the user first passes a config file with 'zenith init --config=<path>'
        // We read that in, in `create_config`, and fill any missing defaults. Then it's saved
-        // to .zenith/config. TODO: We lose any formatting and comments along the way, which is
+        // to .neon/config. TODO: We lose any formatting and comments along the way, which is
        // a bit sad.
        let mut conf_content = r#"# This file describes a locale deployment of the page server
 # and safekeeeper node. It is read by the 'zenith' command-line
@@ -401,16 +403,6 @@ impl LocalEnv {
                self.pg_distrib_dir.display()
            );
        }
-        for binary in ["pageserver", "safekeeper"] {
-            if !self.zenith_distrib_dir.join(binary).exists() {
-                bail!(
-                    "Can't find binary '{}' in zenith distrib dir '{}'",
-                    binary,
-                    self.zenith_distrib_dir.display()
-                );
-            }
-        }
-
        for binary in ["pageserver", "safekeeper"] {
            if !self.zenith_distrib_dir.join(binary).exists() {
                bail!(
@@ -419,12 +411,6 @@ impl LocalEnv {
                );
            }
        }
-        if !self.pg_distrib_dir.join("bin/postgres").exists() {
-            bail!(
-                "Can't find postgres binary at {}",
-                self.pg_distrib_dir.display()
-            );
-        }

        fs::create_dir(&base_path)?;

@@ -481,9 +467,9 @@ impl LocalEnv {
 }

 fn base_path() -> PathBuf {
-    match std::env::var_os("ZENITH_REPO_DIR") {
+    match std::env::var_os("NEON_REPO_DIR") {
        Some(val) => PathBuf::from(val),
-        None => PathBuf::from(".zenith"),
+        None => PathBuf::from(".neon"),
    }
 }

--- a/control_plane/src/postgresql_conf.rs
+++ b/control_plane/src/postgresql_conf.rs
@@ -5,7 +5,7 @@
 /// enough to extract a few settings we need in Zenith, assuming you don't do
 /// funny stuff like include-directives or funny escaping.
 use anyhow::{bail, Context, Result};
-use lazy_static::lazy_static;
+use once_cell::sync::Lazy;
 use regex::Regex;
 use std::collections::HashMap;
 use std::fmt;
@@ -19,9 +19,7 @@ pub struct PostgresConf {
    hash: HashMap<String, String>,
 }

-lazy_static! {
-    static ref CONF_LINE_RE: Regex = Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap();
-}
+static CONF_LINE_RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"^((?:\w|\.)+)\s*=\s*(\S+)$").unwrap());

 impl PostgresConf {
    pub fn new() -> PostgresConf {
@@ -139,10 +137,10 @@ fn escape_str(s: &str) -> String {
    //
    // This regex is a bit more conservative than the rules in guc-file.l, so we quote some
    // strings that PostgreSQL would accept without quoting, but that's OK.
-    lazy_static! {
-        static ref UNQUOTED_RE: Regex =
-            Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap();
-    }
+
+    static UNQUOTED_RE: Lazy<Regex> =
+        Lazy::new(|| Regex::new(r"(^[-+]?[0-9]+[a-zA-Z]*$)|(^[a-zA-Z][a-zA-Z0-9]*$)").unwrap());
+
    if UNQUOTED_RE.is_match(s) {
        s.to_string()
    } else {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -149,6 +149,11 @@ impl SafekeeperNode {
        if let Some(ref remote_storage) = self.conf.remote_storage {
            cmd.args(&["--remote-storage", remote_storage]);
        }
+        if self.conf.auth_enabled {
+            cmd.arg("--auth-validation-public-key-path");
+            // PathBuf is better passed as is, not via `String`.
+            cmd.arg(self.env.base_data_dir.join("auth_public_key.pem"));
+        }

        fill_aws_secrets_vars(&mut cmd);

@@ -242,7 +247,7 @@ impl SafekeeperNode {
        // Shutting down may take a long time,
        // if safekeeper flushes a lot of data
        let mut tcp_stopped = false;
-        for _ in 0..100 {
+        for i in 0..600 {
            if !tcp_stopped {
                if let Err(err) = TcpStream::connect(&address) {
                    tcp_stopped = true;
@@ -267,9 +272,11 @@ impl SafekeeperNode {
                    }
                }
            }
-            print!(".");
-            io::stdout().flush().unwrap();
-            thread::sleep(Duration::from_secs(1));
+            if i % 10 == 0 {
+                print!(".");
+                io::stdout().flush().unwrap();
+            }
+            thread::sleep(Duration::from_millis(100));
        }

        bail!("Failed to stop safekeeper with pid {}", pid);
@@ -299,10 +306,9 @@ impl SafekeeperNode {
        Ok(self
            .http_request(
                Method::POST,
-                format!("{}/{}", self.http_base_url, "timeline"),
+                format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
            )
            .json(&TimelineCreateRequest {
-                tenant_id,
                timeline_id,
                peer_ids,
            })
--- a/control_plane/src/storage.rs
+++ b/control_plane/src/storage.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
-use std::io::Write;
+use std::fs::File;
+use std::io::{BufReader, Write};
 use std::net::TcpStream;
 use std::num::NonZeroU64;
 use std::path::PathBuf;
@@ -11,9 +12,9 @@ use anyhow::{bail, Context};
 use nix::errno::Errno;
 use nix::sys::signal::{kill, Signal};
 use nix::unistd::Pid;
-use pageserver::http::models::{TenantConfigRequest, TenantCreateRequest, TimelineCreateRequest};
-use pageserver::tenant_mgr::TenantInfo;
-use pageserver::timelines::TimelineInfo;
+use pageserver::http::models::{
+    TenantConfigRequest, TenantCreateRequest, TenantInfo, TimelineCreateRequest, TimelineInfo,
+};
 use postgres::{Config, NoTls};
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
@@ -317,7 +318,7 @@ impl PageServerNode {
        // Shutting down may take a long time,
        // if pageserver checkpoints a lot of data
        let mut tcp_stopped = false;
-        for _ in 0..100 {
+        for i in 0..600 {
            if !tcp_stopped {
                if let Err(err) = TcpStream::connect(&address) {
                    tcp_stopped = true;
@@ -343,9 +344,11 @@ impl PageServerNode {
                    }
                }
            }
-            print!(".");
-            io::stdout().flush().unwrap();
-            thread::sleep(Duration::from_secs(1));
+            if i % 10 == 0 {
+                print!(".");
+                io::stdout().flush().unwrap();
+            }
+            thread::sleep(Duration::from_millis(100));
        }

        bail!("Failed to stop pageserver with pid {}", pid);
@@ -398,6 +401,7 @@ impl PageServerNode {
                    .get("checkpoint_distance")
                    .map(|x| x.parse::<u64>())
                    .transpose()?,
+                checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
                compaction_target_size: settings
                    .get("compaction_target_size")
                    .map(|x| x.parse::<u64>())
@@ -452,6 +456,7 @@ impl PageServerNode {
                    .map(|x| x.parse::<u64>())
                    .transpose()
                    .context("Failed to parse 'checkpoint_distance' as an integer")?,
+                checkpoint_timeout: settings.get("checkpoint_timeout").map(|x| x.to_string()),
                compaction_target_size: settings
                    .get("compaction_target_size")
                    .map(|x| x.parse::<u64>())
@@ -527,4 +532,54 @@ impl PageServerNode {

        Ok(timeline_info_response)
    }
+
+    /// Import a basebackup prepared using either:
+    /// a) `pg_basebackup -F tar`, or
+    /// b) The `fullbackup` pageserver endpoint
+    ///
+    /// # Arguments
+    /// * `tenant_id` - tenant to import into. Created if it does not exist
+    /// * `timeline_id` - id to assign to imported timeline
+    /// * `base` - (start lsn of basebackup, path to `base.tar` file)
+    /// * `pg_wal` - if there's any wal to import: (end lsn, path to `pg_wal.tar`)
+    pub fn timeline_import(
+        &self,
+        tenant_id: ZTenantId,
+        timeline_id: ZTimelineId,
+        base: (Lsn, PathBuf),
+        pg_wal: Option<(Lsn, PathBuf)>,
+    ) -> anyhow::Result<()> {
+        let mut client = self.pg_connection_config.connect(NoTls).unwrap();
+
+        // Init base reader
+        let (start_lsn, base_tarfile_path) = base;
+        let base_tarfile = File::open(base_tarfile_path)?;
+        let mut base_reader = BufReader::new(base_tarfile);
+
+        // Init wal reader if necessary
+        let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal {
+            let wal_tarfile = File::open(wal_tarfile_path)?;
+            let wal_reader = BufReader::new(wal_tarfile);
+            (end_lsn, Some(wal_reader))
+        } else {
+            (start_lsn, None)
+        };
+
+        // Import base
+        let import_cmd =
+            format!("import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
+        let mut writer = client.copy_in(&import_cmd)?;
+        io::copy(&mut base_reader, &mut writer)?;
+        writer.finish()?;
+
+        // Import wal if necessary
+        if let Some(mut wal_reader) = wal_reader {
+            let import_cmd = format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}");
+            let mut writer = client.copy_in(&import_cmd)?;
+            io::copy(&mut wal_reader, &mut writer)?;
+            writer.finish()?;
+        }
+
+        Ok(())
+    }
 }
--- a/docker-entrypoint.sh
+++ b/docker-entrypoint.sh
@@ -1,6 +1,8 @@
 #!/bin/sh
 set -eux

+pageserver_id_param="${NODE_ID:-10}"
+
 broker_endpoints_param="${BROKER_ENDPOINT:-absent}"
 if [ "$broker_endpoints_param" != "absent" ]; then
    broker_endpoints_param="-c broker_endpoints=['$broker_endpoints_param']"
@@ -8,10 +10,12 @@ else
    broker_endpoints_param=''
 fi

+remote_storage_param="${REMOTE_STORAGE:-}"
+
 if [ "$1" = 'pageserver' ]; then
    if [ ! -d "/data/tenants" ]; then
        echo "Initializing pageserver data directory"
-        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=10" $broker_endpoints_param
+        pageserver --init -D /data -c "pg_distrib_dir='/usr/local'" -c "id=${pageserver_id_param}" $broker_endpoints_param $remote_storage_param
    fi
    echo "Staring pageserver at 0.0.0.0:6400"
    pageserver -c "listen_pg_addr='0.0.0.0:6400'" -c "listen_http_addr='0.0.0.0:9898'" $broker_endpoints_param -D /data
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+book
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,14 +0,0 @@
-# Zenith documentation
-
-## Table of contents
-
- [authentication.md](authentication.md) — pageserver JWT authentication.
- [docker.md](docker.md) — Docker images and building pipeline.
- [glossary.md](glossary.md) — Glossary of all the terms used in codebase.
- [multitenancy.md](multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
- [sourcetree.md](sourcetree.md) — Overview of the source tree layout.
- [pageserver/README.md](/pageserver/README.md) — pageserver overview.
- [postgres_ffi/README.md](/libs/postgres_ffi/README.md) — Postgres FFI overview.
- [test_runner/README.md](/test_runner/README.md) — tests infrastructure overview.
- [safekeeper/README.md](/safekeeper/README.md) — WAL service overview.
- [core_changes.md](core_changes.md) - Description of Zenith changes in Postgres core
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -0,0 +1,82 @@
+# Summary
+
+[Introduction]()
+- [Separation of Compute and Storage](./separation-compute-storage.md)
+
+# Architecture
+
+- [Compute]()
+  - [WAL proposer]()
+  - [WAL Backpressure]()
+  - [Postgres changes](./core_changes.md)
+
+- [Pageserver](./pageserver.md)
+    - [Services](./pageserver-services.md)
+    - [Thread management](./pageserver-thread-mgmt.md)
+    - [WAL Redo](./pageserver-walredo.md)
+    - [Page cache](./pageserver-pagecache.md)
+    - [Storage](./pageserver-storage.md)
+        - [Datadir mapping]()
+        - [Layer files]()
+        - [Branching]()
+        - [Garbage collection]()
+    - [Cloud Storage]()
+    - [Processing a GetPage request](./pageserver-processing-getpage.md)
+    - [Processing WAL](./pageserver-processing-wal.md)
+    - [Management API]()
+    - [Tenant Rebalancing]()
+
+- [WAL Service](walservice.md)
+  - [Consensus protocol](safekeeper-protocol.md)
+  - [Management API]()
+  - [Rebalancing]()
+
+- [Control Plane]()
+
+- [Proxy]()
+
+- [Source view](./sourcetree.md)
+  - [docker.md](./docker.md) — Docker images and building pipeline.
+  - [Error handling and logging]()
+  - [Testing]()
+    - [Unit testing]()
+    - [Integration testing]()
+    - [Benchmarks]()
+
+
+- [Glossary](./glossary.md)
+
+# Uncategorized
+
+- [authentication.md](./authentication.md)
+- [multitenancy.md](./multitenancy.md) — how multitenancy is organized in the pageserver and Zenith CLI.
+- [settings.md](./settings.md)
+#FIXME: move these under sourcetree.md
+#- [postgres_ffi/README.md](/libs/postgres_ffi/README.md)
+#- [test_runner/README.md](/test_runner/README.md)
+
+
+# RFCs
+
+- [RFCs](./rfcs/README.md)
+
+- [002-storage](rfcs/002-storage.md)
+- [003-laptop-cli](rfcs/003-laptop-cli.md)
+- [004-durability](rfcs/004-durability.md)
+- [005-zenith_local](rfcs/005-zenith_local.md)
+- [006-laptop-cli-v2-CLI](rfcs/006-laptop-cli-v2-CLI.md)
+- [006-laptop-cli-v2-repository-structure](rfcs/006-laptop-cli-v2-repository-structure.md)
+- [007-serverless-on-laptop](rfcs/007-serverless-on-laptop.md)
+- [008-push-pull](rfcs/008-push-pull.md)
+- [009-snapshot-first-storage-cli](rfcs/009-snapshot-first-storage-cli.md)
+- [009-snapshot-first-storage](rfcs/009-snapshot-first-storage.md)
+- [009-snapshot-first-storage-pitr](rfcs/009-snapshot-first-storage-pitr.md)
+- [010-storage_details](rfcs/010-storage_details.md)
+- [011-retention-policy](rfcs/011-retention-policy.md)
+- [012-background-tasks](rfcs/012-background-tasks.md)
+- [013-term-history](rfcs/013-term-history.md)
+- [014-safekeepers-gossip](rfcs/014-safekeepers-gossip.md)
+- [014-storage-lsm](rfcs/014-storage-lsm.md)
+- [015-storage-messaging](rfcs/015-storage-messaging.md)
+- [016-connection-routing](rfcs/016-connection-routing.md)
+- [cluster-size-limits](rfcs/cluster-size-limits.md)
--- a/docs/book.toml
+++ b/docs/book.toml
@@ -0,0 +1,5 @@
+[book]
+language = "en"
+multilingual = false
+src = "."
+title = "Neon architecture"
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -1,202 +1,519 @@
-1. Add t_cid to XLOG record
- Why?
-  The cmin/cmax on a heap page is a real bummer. I don't see any other way to fix that than bite the bullet and modify the WAL-logging routine to include the cmin/cmax.
+# Postgres core changes

-  To recap, the problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares abut it anymore.
+This lists all the changes that have been made to the PostgreSQL
+source tree, as a somewhat logical set of patches. The long-term goal
+is to eliminate all these changes, by submitting patches to upstream
+and refactoring code into extensions, so that you can run unmodified
+PostgreSQL against Neon storage.

- Alternatives?
-  I don't know
+In Neon, we run PostgreSQL in the compute nodes, but we also run a special WAL redo process in the
+page server. We currently use the same binary for both, with --wal-redo runtime flag to launch it in
+the WAL redo mode. Some PostgreSQL changes are needed in the compute node, while others are just for
+the WAL redo process.

-2. Add PD_WAL_LOGGED.
- Why?
-  Postgres sometimes writes data to the page before it is wal-logged. If such page ais swapped out, we  will loose this change. The problem is currently solved by setting PD_WAL_LOGGED bit in page header. When page without this bit set is written to the SMGR, then it is forced to be written to the WAL as FPI using log_newpage_copy() function.
+In addition to core PostgreSQL changes, there is a Neon extension in contrib/neon, to hook into the
+smgr interface. Once all the core changes have been submitted to upstream or eliminated some other
+way, the extension could live outside the postgres repository and build against vanilla PostgreSQL.

-  There was wrong assumption that it can happen only during construction of some exotic indexes (like gist). It is not true. The same situation can happen with COPY,VACUUM and when record hint bits are set.
+Below is a list of all the PostgreSQL source code changes, categorized into changes needed for
+compute, and changes needed for the WAL redo process:

- Discussion:
-  https://discord.com/channels/869525774699462656/882681420986851359
+# Changes for Compute node

- Alternatives:
-  Do not store this flag in page header, but associate this bit with shared buffer. Logically it is more correct but in practice we will get not advantages: neither in space, neither in CPU overhead.
+## Add t_cid to heap WAL records
+
+```
+ src/backend/access/heap/heapam.c                            |   26 +-
+ src/include/access/heapam_xlog.h                            |    6 +-
+```
+
+We have added a new t_cid field to heap WAL records. This changes the WAL record format, making Neon WAL format incompatible with vanilla PostgreSQL!
+
+### Problem we're trying to solve
+
+The problem is that the XLOG_HEAP_INSERT record does not include the command id of the inserted row. And same with deletion/update. So in the primary, a row is inserted with current xmin + cmin. But in the replica, the cmin is always set to 1. That works in PostgreSQL, because the command id is only relevant to the inserting transaction itself. After commit/abort, no one cares about it anymore. But with Neon, we rely on WAL replay to reconstruct the page, even while the original transaction is still running.
+
+### How to get rid of the patch
+
+Bite the bullet and submit the patch to PostgreSQL, to add the t_cid to the WAL records. It makes the WAL records larger, which could make this unpopular in the PostgreSQL community. However, it might simplify some logical decoding code; Andres Freund briefly mentioned in PGCon 2022 discussion on Heikki's Neon presentation that logical decoding currently needs to jump through some hoops to reconstruct the same information.


-3. XLogReadBufferForRedo not always loads and pins requested buffer. So we need to add extra checks that buffer is really pinned. Also do not use BufferGetBlockNumber for buffer returned by XLogReadBufferForRedo.
- Why?
-  XLogReadBufferForRedo is not pinning pages which are not requested by wal-redo. It is specific only for wal-redo Postgres.
+### Alternatives
+Perhaps we could write an extra WAL record with the t_cid information, when a page is evicted that contains rows that were touched by a transaction that's still running. However, that seems very complicated.

- Alternatives?
-  No
+## ginfast.c
+
+```
+diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c
+index e0d9940946..2d964c02e9 100644
+--- a/src/backend/access/gin/ginfast.c
+++ b/src/backend/access/gin/ginfast.c
+@@ -285,6 +285,17 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
+                memset(&sublist, 0, sizeof(GinMetaPageData));
+                makeSublist(index, collector->tuples, collector->ntuples, &sublist);
+ 
+               if (metadata->head != InvalidBlockNumber)
+               {
+                       /*
+                        * ZENITH: Get buffer before XLogBeginInsert() to avoid recursive call
+                        * of XLogBeginInsert(). Reading a new buffer might evict a dirty page from
+                        * the buffer cache, and if that page happens to be an FSM or VM page, zenith_write()
+                        * will try to WAL-log an image of the page.
+                        */
+                       buffer = ReadBuffer(index, metadata->tail);
+               }
+
+                if (needWal)
+                        XLogBeginInsert();
+ 
+@@ -316,7 +327,6 @@ ginHeapTupleFastInsert(GinState *ginstate, GinTupleCollector *collector)
+                        data.prevTail = metadata->tail;
+                        data.newRightlink = sublist.head;
+ 
+-                       buffer = ReadBuffer(index, metadata->tail);
+                        LockBuffer(buffer, GIN_EXCLUSIVE);
+                        page = BufferGetPage(buffer);
+```
+
+The problem is explained in the comment above
+
+### How to get rid of the patch
+
+Can we stop WAL-logging FSM or VM pages? Or delay the WAL logging until we're out of the critical
+section or something.
+
+Maybe some bigger rewrite of FSM and VM would help to avoid WAL-logging FSM and VM page images?


-4. Eliminate reporting of some warnings related with hint bits, for example
-"page is not marked all-visible but visibility map bit is set in relation".
- Why?
-  Hint bit may be not WAL logged.
+## Mark index builds that use buffer manager without logging explicitly

- Alternative?
-  Always wal log any page changes.
+```
+ src/backend/access/gin/gininsert.c                          |    7 +
+ src/backend/access/gist/gistbuild.c                         |   15 +-
+ src/backend/access/spgist/spginsert.c                       |    8 +-
+
+also some changes in src/backend/storage/smgr/smgr.c
+```
+
+When a GIN index is built, for example, it is built by inserting the entries into the index more or
+less normally, but without WAL-logging anything. After the index has been built, we iterate through
+all pages and write them to the WAL. That doesn't work for Neon, because if a page is not WAL-logged
+and is evicted from the buffer cache, it is lost. We have a check to catch that in the Neon
+extension. To fix that, we've added a few functions to track explicitly when we're performing such
+an operation: `smgr_start_unlogged_build`, `smgr_finish_unlogged_build_phase_1` and
+`smgr_end_unlogged_build`.


-5. Maintain last written LSN.
- Why?
-  When compute node requests page from page server, we need to specify LSN. Ideally it should be LSN
-  of WAL record performing last update of this pages. But we do not know it, because we do not have page.
-  We can use current WAL flush position, but in this case there is high probability that page server
-  will be blocked until this peace of WAL is delivered.
-  As better approximation we can keep max LSN of written page. It will be better to take in account LSNs only of evicted pages,
-  but SMGR API doesn't provide such knowledge.
+### How to get rid of the patch

- Alternatives?
-  Maintain map of LSNs of evicted pages.
+I think it would make sense to be more explicit about that in PostgreSQL too. So extract these
+changes to a patch and post to pgsql-hackers.


-6. Launching Postgres without WAL.
- Why?
-  According to Zenith architecture compute node is stateless. So when we are launching
-  compute node, we need to provide some dummy PG_DATADIR. Relation pages
-  can be requested on demand from page server. But Postgres still need some non-relational data:
-  control and configuration files, SLRUs,...
-  It is currently implemented  using basebackup (do not mix with pg_basebackup) which is created
-  by pageserver. It includes in this tarball config/control files, SLRUs and required directories.
-  As far as pageserver do not have original (non-scattered) WAL segments, it includes in
-  this tarball dummy WAL segment which contains only SHUTDOWN_CHECKPOINT record at the beginning of segment,
-  which redo field points to the end of wal. It allows to load checkpoint record in more or less
-  standard way with minimal changes of Postgres, but then some special handling is needed,
-  including restoring previous record position from zenith.signal file.
-  Also we have to correctly initialize header of last WAL page (pointed by checkpoint.redo)
-  to pass checks performed by XLogReader.
+## Track last-written page LSN

- Alternatives?
-  We may not include fake WAL segment in tarball at all and modify xlog.c to load checkpoint record
-  in special way. But it may only increase number of changes in xlog.c
+```
+ src/backend/commands/dbcommands.c                           |   17 +-

-7. Add redo_read_buffer_filter callback to XLogReadBufferForRedoExtended
- Why?
-  We need a way in wal-redo Postgres to ignore pages which are not requested by pageserver.
-  So wal-redo Postgres reconstructs only requested page and for all other returns BLK_DONE
-  which means that recovery for them is not needed.
+Also one call to SetLastWrittenPageLSN() in spginsert.c, maybe elsewhere too
+```

- Alternatives?
-  No
+Whenever a page is evicted from the buffer cache, we remember its LSN, so that we can use the same
+LSN in the GetPage@LSN request when reading the page back from the page server. The value is
+conservative: it would be correct to always use the last-inserted LSN, but it would be slow because
+then the page server would need to wait for the recent WAL to be streamed and processed, before
+responding to any GetPage@LSN request.

-8. Enforce WAL logging of sequence updates.
- Why?
-  Due to performance reasons Postgres don't want to log each fetching of a value from a sequence,
-  so we pre-log a few fetches in advance. In the event of crash we can lose
-  (skip over) as many values as we pre-logged.
-  But it doesn't work with Zenith because page with sequence value can be evicted from buffer cache
-  and we will get a gap in sequence values even without crash.
+The last-written page LSN is mostly tracked in the smgrwrite() function, without core code changes,
+but there are a few exceptions where we've had to add explicit calls to the Neon-specific
+SetLastWrittenPageLSN() function.

- Alternatives:
-  Do not try to preserve sequential order but avoid performance penalty.
+There's an open PR to track the LSN in a more fine-grained fashion:
+https://github.com/neondatabase/postgres/pull/177
+
+PostgreSQL v15 introduces a new method to do CREATE DATABASE that WAL-logs the database instead of
+relying on copying files and a checkpoint. With that method, we probably won't need any special handling.
+The old method is still available, though.
+
+### How to get rid of the patch
+
+Wait until v15?


-9. Treat unlogged tables as normal (permanent) tables.
- Why?
-  Unlogged tables are not transient, so them have to survive node restart (unlike temporary tables).
-  But as far as compute node is stateless, we need to persist their data to storage node.
-  And it can only be done through the WAL.
+## Cache relation sizes

- Alternatives?
-  * Store unlogged tables locally (violates requirement of stateless compute nodes).
-  * Prohibit unlogged tables at all.
+The Neon extension contains a little cache for smgrnblocks() and smgrexists() calls, to avoid going
+to the page server every time. It might be useful to cache those in PostgreSQL, maybe in the
+relcache? (I think we do cache nblocks in relcache already, check why that's not good enough for
+Neon)


-10. Support start Postgres in wal-redo mode
- Why?
-  To be able to apply WAL record and reconstruct pages at page server.
+## Misc change in vacuumlazy.c

- Alternatives?
-  * Rewrite redo handlers in Rust
-  * Do not reconstruct pages at page server at all and do it at compute node.
+```
+index 8aab6e324e..c684c4fbee 100644
+--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
+@@ -1487,7 +1487,10 @@ lazy_scan_heap(LVRelState *vacrel, VacuumParams *params, bool aggressive)
+                else if (all_visible_according_to_vm && !PageIsAllVisible(page)
+                                 && VM_ALL_VISIBLE(vacrel->rel, blkno, &vmbuffer))
+                {
+-                       elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
+                       /* ZENITH-XXX: all visible hint is not wal-logged
+                        * FIXME: Replay visibilitymap changes in pageserver
+                        */
+                       elog(DEBUG1, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
+                                 vacrel->relname, blkno);
+                        visibilitymap_clear(vacrel->rel, blkno, vmbuffer,
+                                                                VISIBILITYMAP_VALID_BITS);
+```


-11. WAL proposer
- Why?
-  WAL proposer is communicating with safekeeper and ensures WAL durability by quorum writes.
-  It is currently implemented as patch to standard WAL sender.
-
- Alternatives?
-  Can be moved to extension if some extra callbacks will be added to wal sender code.
+Is this still needed? If that WARNING happens, it looks like potential corruption that we should
+fix!


-12. Secure Computing BPF API wrapper.
- Why?
-  Pageserver delegates complex WAL decoding duties to Postgres,
-  which means that the latter might fall victim to carefully designed
-  malicious WAL records and start doing harmful things to the system.
-  To prevent this, it has been decided to limit possible interactions
-  with the outside world using the Secure Computing BPF mode.
+## Use buffer manager when extending VM or FSM

- Alternatives:
-  * Rewrite redo handlers in Rust.
-  * Add more checks to guarantee correctness of WAL records.
-  * Move seccomp.c to extension
-  * Many other discussed approaches to neutralize incorrect WAL records vulnerabilities.
+```
+ src/backend/storage/freespace/freespace.c                   |   14 +-
+ src/backend/access/heap/visibilitymap.c                     |   15 +-
+
+diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
+index e198df65d8..addfe93eac 100644
+--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
+@@ -652,10 +652,19 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
+        /* Now extend the file */
+        while (vm_nblocks_now < vm_nblocks)
+        {
+-               PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
+               /*
+                * ZENITH: Initialize VM pages through buffer cache to prevent loading
+                * them from pageserver.
+                */
+               Buffer  buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
+                                                                                       RBM_ZERO_AND_LOCK, NULL);
+               Page    page = BufferGetPage(buffer);
+
+               PageInit((Page) page, BLCKSZ, 0);
+               PageSetChecksumInplace(page, vm_nblocks_now);
+               MarkBufferDirty(buffer);
+               UnlockReleaseBuffer(buffer);
+ 
+-               smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
+-                                  pg.data, false);
+                vm_nblocks_now++;
+        }
+```
+
+### Problem we're trying to solve
+
+???
+
+### How to get rid of the patch
+
+Maybe this would be a reasonable change in PostgreSQL too?


-13. Callbacks for replica feedbacks
- Why?
-  Allowing waproposer to interact with walsender code.
+## Allow startup without reading checkpoint record

- Alternatives
-  Copy walsender code to walproposer.
+In Neon, the compute node is stateless. So when we launch a compute node, we need to provide
+some dummy PG_DATADIR. Relation pages can be requested on demand from the page server, but Postgres
+still needs some non-relational data: control and configuration files, SLRUs, etc. This is currently
+implemented using a basebackup (not to be confused with pg_basebackup), which is created by the
+pageserver. The tarball includes config/control files, SLRUs and the required directories.
+
+As pageserver does not have the original WAL segments, the basebackup tarball includes an empty WAL
+segment to bootstrap the WAL writing, but it doesn't contain the checkpoint record.  There are some
+changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
+from WAL.
+
+This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
+at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
+checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.


-14. Support multiple SMGR implementations.
- Why?
-  Postgres provides abstract API for storage manager but it has only one implementation
-  and provides no way to replace it with custom storage manager.
+### How to get rid of the patch

- Alternatives?
-  None.
+???


-15. Calculate database size as sum of all database relations.
- Why?
-  Postgres is calculating database size by traversing data directory
-  but as far as Zenith compute node is stateless we can not do it.
+### Alternatives

- Alternatives?
-  Send this request directly to pageserver and calculate real (physical) size
-  of Zenith representation of database/timeline, rather than sum logical size of all relations.
+Include a fake checkpoint record in the tarball. Creating fake WAL is a bit risky, though; I'm
+afraid it might accidentally get streamed to the safekeepers and overwrite or corrupt the real WAL.
+
+## Disable sequence caching
+
+```
+diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
+index 0415df9ccb..9f9db3c8bc 100644
+--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
+@@ -53,7 +53,9 @@
+  * so we pre-log a few fetches in advance. In the event of
+  * crash we can lose (skip over) as many values as we pre-logged.
+  */
+-#define SEQ_LOG_VALS   32
+/* Zenith XXX: to ensure sequence order of sequence in Zenith we need to WAL log each sequence update. */
+/* #define SEQ_LOG_VALS        32 */
+#define SEQ_LOG_VALS   0
+```
+
+For performance reasons, Postgres doesn't want to log each fetch of a value from a sequence, so
+it pre-logs a few fetches in advance. In the event of a crash we can lose (skip over) as many values
+as we pre-logged. But with Neon, because the page with the sequence value can be evicted from the
+buffer cache, we can get a gap in sequence values even without a crash.
+
+### How to get rid of the patch
+
+Maybe we can just remove it, and accept the gaps. Or add some special handling for sequence
+relations in the Neon extension, to WAL log the sequence page when it's about to be evicted. It
+would be weird if the sequence moved backwards though, think of PITR.
+
+Or add a GUC to PostgreSQL for the number of values to pre-log, and force it to 1 in Neon.


-----------------------------------------------
-Not currently committed but proposed:
+## Walproposer

-1. Disable ring buffer buffer manager strategies
- Why?
-  Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
-  Even if there are free space in buffer cache, pages may be evicted.
-  Negative effect of it can be somehow compensated by file system cache, but in case of Zenith
-  cost of requesting page from page server is much higher.
+```
+ src/Makefile                                                |    1 +
+ src/backend/replication/libpqwalproposer/Makefile           |   37 +
+ src/backend/replication/libpqwalproposer/libpqwalproposer.c |  416 ++++++++++++
+ src/backend/postmaster/bgworker.c                           |    4 +
+ src/backend/postmaster/postmaster.c                         |    6 +
+ src/backend/replication/Makefile                            |    4 +-
+ src/backend/replication/walproposer.c                       | 2350 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+ src/backend/replication/walproposer_utils.c                 |  402 +++++++++++
+ src/backend/replication/walreceiver.c                       |    7 +
+ src/backend/replication/walsender.c                         |  320 ++++++---
+ src/backend/storage/ipc/ipci.c                              |    6 +
+ src/include/replication/walproposer.h                       |  565 ++++++++++++++++
+```

- Alternatives?
-  Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
-  for example copy evicted page from ring buffer to some other buffer if there is free space
-  in buffer cache.
+The WAL proposer communicates with the safekeepers and ensures WAL durability by quorum writes. It is
+currently implemented as a patch to the standard WAL sender.

-2. Disable marking page as dirty when hint bits are set.
- Why?
-  Postgres has to modify page twice: first time when some tuple is updated and second time when
-  hint bits are set. Wal logging hint bits updates requires FPI which significantly increase size of WAL.
+### How to get rid of the patch

- Alternatives?
-  Add special WAL record for setting page hints.
+Refactor into an extension. Submit hooks or APIs into upstream if necessary.

-3. Prefetching
- Why?
-  As far as pages in Zenith are loaded on demand, to reduce node startup time
-  and also sppedup some massive queries we need some mechanism for bulk loading to
-  reduce page request round-trip overhead.
+@MMeent did some work on this already: https://github.com/neondatabase/postgres/pull/96

-  Currently Postgres is supporting prefetching only for bitmap scan.
-  In Zenith we also use prefetch for sequential and index scan. For sequential scan we prefetch
-  some number of following pages. For index scan we prefetch pages of heap relation addressed by TIDs.
+## Ignore unexpected data beyond EOF in bufmgr.c

-4. Prewarming.
- Why?
-  Short downtime (or, in other words, fast compute node restart time) is one of the key feature of Zenith.
-  But overhead of request-response round-trip for loading pages on demand can make started node warm-up quite slow.
-  We can capture state of compute node buffer cache and send bulk request for this pages at startup.
+```
+@@ -922,11 +928,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+                 */
+                bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+                if (!PageIsNew((Page) bufBlock))
+-                       ereport(ERROR,
+               {
+                        // XXX-ZENITH
+                        MemSet((char *) bufBlock, 0, BLCKSZ);
+                        ereport(DEBUG1,
+                                        (errmsg("unexpected data beyond EOF in block %u of relation %s",
+                                                        blockNum, relpath(smgr->smgr_rnode, forkNum)),
+                                         errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+-
+               }
+                /*
+                 * We *must* do smgrextend before succeeding, else the page will not
+                 * be reserved by the kernel, and the next P_NEW call will decide to
+```
+
+PostgreSQL is a bit sloppy with extending relations. Usually, the relation is extended with zeros
+first, then the page is filled, and finally the new page WAL-logged. But if multiple backends extend
+a relation at the same time, the pages can be WAL-logged in a different order.
+
+I'm not sure what scenario exactly required this change in Neon, though.
+
+### How to get rid of the patch
+
+Submit patches to pgsql-hackers, to tighten up the WAL-logging around relation extension. It's a bit
+confusing even in PostgreSQL. Maybe WAL log the intention to extend first, then extend the relation,
+and finally WAL-log that the extension succeeded.
+
+## Make smgr interface available to extensions
+
+```
+ src/backend/storage/smgr/smgr.c                             |  203 +++---
+ src/include/storage/smgr.h                                  |   72 +-
+```
+
+### How to get rid of the patch
+
+Submit to upstream. This could be useful for the Disk Encryption patches too, or for compression.
+
+
+## Added relpersistence argument to smgropen()
+
+```
+ src/backend/access/heap/heapam_handler.c                    |    2 +-
+ src/backend/catalog/storage.c                               |   10 +-
+ src/backend/commands/tablecmds.c                            |    2 +-
+ src/backend/storage/smgr/md.c                               |    4 +-
+ src/include/utils/rel.h                                     |    3 +-
+```
+
+Neon needs to treat unlogged relations differently from others, so the smgrread(), smgrwrite() etc.
+implementations need to know the 'relpersistence' of the relation. To get that information where
+it's needed, we added the 'relpersistence' field to smgropen().
+
+### How to get rid of the patch
+
+Maybe 'relpersistence' would be useful in PostgreSQL for debugging purposes? Or simply for the
+benefit of extensions like Neon. Should consider this in the patch to make smgr API usable to
+extensions.
+
+### Alternatives
+
+Currently in Neon, unlogged tables live on local disk in the compute node, and are wiped away on
+compute node restart. One alternative would be to instead WAL-log even unlogged tables, essentially
+ignoring the UNLOGGED option. Or prohibit UNLOGGED tables completely. But would we still need the
+relpersistence argument to handle index builds? See item on "Mark index builds that use buffer
+manager without logging explicitly".
+
+## Use smgr and dbsize_hook for size calculations
+
+```
+ src/backend/utils/adt/dbsize.c                              |   61 +-
+```
+
+In PostgreSQL, the rel and db-size functions scan the data directory directly. That won't work in Neon.
+
+### How to get rid of the patch
+
+Send patch to PostgreSQL, to use smgr API functions for relation size calculation instead. Maybe as
+part of the general smgr API patch.
+
+
+
+# WAL redo process changes
+
+Pageserver delegates complex WAL decoding duties to Postgres, which means that the latter might fall
+victim to carefully designed malicious WAL records and start doing harmful things to the system.  To
+prevent this, the redo functions are executed in a separate process that is sandboxed with Linux
+Secure Computing mode (see seccomp(2) man page).
+
+As an alternative to having a separate WAL redo process, we could rewrite all redo handlers in Rust.
+This is infeasible in practice: it would take a lot of effort to rewrite them, ensure that you've done
+the rewrite correctly, and once you've done that, it would be a lot of ongoing maintenance effort to
+keep the rewritten code in sync over time, across new PostgreSQL versions. That's why we want to
+leverage PostgreSQL code.
+
+Another alternative would be to harden all the PostgreSQL WAL redo functions so that it would be
+safe to call them directly from Rust code, without needing the security sandbox. That's not feasible
+for similar reasons as rewriting them in Rust.
+
+
+## Don't replay changes in XLogReadBufferForRedo that are not for the target page we're replaying
+
+```
+ src/backend/access/gin/ginxlog.c                            |   19 +-
+
+Also some changes in xlog.c and xlogutils.c
+
+Example:
+
+@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record)
+        if (!isLeaf)
+                ginRedoClearIncompleteSplit(record, 3);
+ 
+-       if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
+       action = XLogReadBufferForRedo(record, 0, &lbuffer);
+       if (action != BLK_RESTORED && action != BLK_DONE)
+                elog(ERROR, "GIN split record did not contain a full-page image of left page");
+```
+
+### Problem we're trying to solve
+
+In PostgreSQL, if a WAL redo function calls XLogReadBufferForRedo() for a page that has a full-page
+image, it always succeeds. However, the Neon WAL redo process is only concerned about replaying changes
+to a single page, so replaying any changes for other pages is a waste of cycles. We have modified
+XLogReadBufferForRedo() to return BLK_DONE for all other pages, to avoid the overhead. That is
+unexpected by code like the above.
+
+### How to get rid of the patch
+
+Submit the changes to upstream, hope the community accepts them. There's no harm to PostgreSQL from
+these changes, although it doesn't have any benefit either.
+
+To make these changes useful to upstream PostgreSQL, we could implement a feature to look ahead the
+WAL, and detect truncated relations. Even in PostgreSQL, it is a waste of cycles to replay changes
+to pages that are later truncated away, so we could have XLogReadBufferForRedo() return BLK_DONE or
+BLK_NOTFOUND for pages that are known to be truncated away later in the WAL stream.
+
+### Alternatives
+
+Maybe we could revert this optimization, and restore pages other than the target page too.
+
+## Add predefined_sysidentifier flag to initdb
+
+```
+ src/backend/bootstrap/bootstrap.c                           |   13 +-
+ src/bin/initdb/initdb.c                                     |    4 +
+
+And some changes in xlog.c
+```
+
+This is used to help with restoring a database when you have all the WAL, all the way back to
+initdb, but no backup. You can reconstruct the missing backup by running initdb again, with the same
+sysidentifier.
+
+
+### How to get rid of the patch
+
+Ignore it. This is only needed for disaster recovery, so once we've eliminated all other Postgres
+patches, we can just keep it around as a patch or as separate branch in a repo.
+
+
+# Not currently committed but proposed
+
+## Disable ring buffer buffer manager strategies
+
+### Why?
+
+Postgres tries to avoid cache flushing by bulk operations (copy, seqscan, vacuum,...).
+Even if there is free space in the buffer cache, pages may be evicted.
+Negative effect of it can be somehow compensated by file system cache, but in Neon,
+cost of requesting page from page server is much higher.
+
+### Alternatives?
+
+Instead of just prohibiting ring buffer we may try to implement more flexible eviction policy,
+for example copy evicted page from ring buffer to some other buffer if there is free space
+in buffer cache.
+
+## Disable marking page as dirty when hint bits are set.
+
+### Why?
+
+Postgres has to modify page twice: first time when some tuple is updated and second time when
+hint bits are set. WAL-logging hint bit updates requires an FPI, which significantly increases the size of the WAL.
+
+### Alternatives?
+
+Add special WAL record for setting page hints.
+
+## Prefetching
+
+### Why?
+
+Because pages in Neon are loaded on demand, to reduce node startup time
+and also speed up some massive queries we need some mechanism for bulk loading to
+reduce page request round-trip overhead.
+
+Currently Postgres is supporting prefetching only for bitmap scan.
+In Neon we should also use prefetch for sequential and index scans, because the OS is not doing it for us.
+For sequential scan we could prefetch some number of following pages. For index scan we could prefetch pages
+of heap relation addressed by TIDs.
+
+## Prewarming
+
+### Why?
+
+Short downtime (or, in other words, fast compute node restart time) is one of the key features of Zenith.
+But the overhead of the request-response round-trip for loading pages on demand can make a started node's warm-up quite slow.
+We can capture the state of the compute node's buffer cache and send a bulk request for these pages at startup.
--- a/docs/glossary.md
+++ b/docs/glossary.md
@@ -75,7 +75,7 @@ layer's Segment and range of LSNs.
 There are two kinds of layers, in-memory and on-disk layers. In-memory
 layers are used to ingest incoming WAL, and provide fast access
 to the recent page versions. On-disk layers are stored as files on disk, and
-are immutable. See pageserver/src/layered_repository/README.md for more.
+are immutable. See [pageserver-storage.md](./pageserver-storage.md) for more.

 ### Layer file (on-disk layer)

@@ -111,7 +111,7 @@ PostgreSQL LSNs and functions to monitor them:
 * `pg_last_wal_replay_lsn ()` - Returns the last write-ahead log location that has been replayed during recovery. If recovery is still in progress this will increase monotonically.
 [source PostgreSQL documentation](https://www.postgresql.org/docs/devel/functions-admin.html):

-Neon safekeeper LSNs. For more check [safekeeper/README_PROTO.md](/safekeeper/README_PROTO.md)
+Neon safekeeper LSNs. See [safekeeper protocol section](safekeeper-protocol.md) for more information.
 * `CommitLSN`: position in WAL confirmed by quorum safekeepers.
 * `RestartLSN`: position in WAL confirmed by all safekeepers.
 * `FlushLSN`: part of WAL persisted to the disk by safekeeper.
--- a/docs/pageserver-page-service.md
+++ b/docs/pageserver-page-service.md
@@ -0,0 +1,9 @@
+# Page Service
+
+The Page Service listens for GetPage@LSN requests from the Compute Nodes,
+and responds with pages from the repository. On each GetPage@LSN request,
+it calls into the Repository function
+
+A separate thread is spawned for each incoming connection to the page
+service. The page service uses the libpq protocol to communicate with
+the client. The client is a Compute Postgres instance.
--- a/docs/pageserver-pagecache.md
+++ b/docs/pageserver-pagecache.md
@@ -0,0 +1,8 @@
+# Page cache
+
+TODO:
+
+- shared across tenants
+- store pages from layer files
+- store pages from "in-memory layer"
+- store materialized pages
--- a/docs/pageserver-processing-getpage.md
+++ b/docs/pageserver-processing-getpage.md
@@ -0,0 +1,4 @@
+# Processing a GetPage request
+
+TODO:
+- sequence diagram that shows how a GetPage@LSN request is processed
--- a/docs/pageserver-processing-wal.md
+++ b/docs/pageserver-processing-wal.md
@@ -0,0 +1,5 @@
+# Processing WAL
+
+TODO:
+- diagram that shows how incoming WAL is processed
+- explain durability, what is fsync'd when, disk_consistent_lsn
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -1,15 +1,4 @@
-## Page server architecture
-
-The Page Server has a few different duties:
-
- Respond to GetPage@LSN requests from the Compute Nodes
- Receive WAL from WAL safekeeper
- Replay WAL that's applicable to the chunks that the Page Server maintains
- Backup to S3
-
-S3 is the main fault-tolerant storage of all data, as there are no Page Server
-replicas. We use a separate fault-tolerant WAL service to reduce latency. It
-keeps track of WAL records which are not synced to S3 yet.
+# Services

 The Page Server consists of multiple threads that operate on a shared
 repository of page versions:
@@ -21,18 +10,22 @@ repository of page versions:
                                   | WAL receiver |
                                   |              |
                                   +--------------+
-                                                                                 +----+
-                  +---------+                              ..........            |    |
-                  |         |                              .        .            |    |
- GetPage@LSN      |         |                              . backup .  ------->  | S3 |
------------->    |  Page   |         repository           .        .            |    |
-                  | Service |                              ..........            |    |
-   page           |         |                                                    +----+
+                                                                                 ......
+                  +---------+                              +--------+            .    .
+                  |         |                              |        |            .    .
+ GetPage@LSN      |         |                              | backup |  ------->  . S3 .
+------------->    |  Page   |         repository           |        |            .    .
+                  | Service |                              +--------+            .    .
+   page           |         |                                                    ......
 <-------------    |         |
-                  +---------+      +--------------------+
-		                   |   Checkpointing /  |
-				   | Garbage collection |
-                                   +--------------------+
+                  +---------+     +-----------+     +--------------------+
+                                  | WAL redo  |     | Checkpointing,     |
+                  +----------+    | processes |     | Garbage collection |
+                  |          |    +-----------+     +--------------------+
+                  |   HTTP   |
+                  | mgmt API |
+                  |          |
+                  +----------+

 Legend:

@@ -40,83 +33,29 @@ Legend:
 |  |   A thread or multi-threaded service
 +--+

-....
-.  .   Component at its early development phase.
-....
-
 --->   Data flow
 <---
 ```

-Page Service
------------
+## Page Service

 The Page Service listens for GetPage@LSN requests from the Compute Nodes,
-and responds with pages from the repository.
+and responds with pages from the repository. On each GetPage@LSN request,
+it calls into the Repository function
+
+A separate thread is spawned for each incoming connection to the page
+service. The page service uses the libpq protocol to communicate with
+the client. The client is a Compute Postgres instance.
+
+## WAL Receiver
+
+The WAL receiver connects to the external WAL safekeeping service
+using PostgreSQL physical streaming replication, and continuously
+receives WAL. It decodes the WAL records, and stores them to the
+repository.


-WAL Receiver
------------
-
-The WAL receiver connects to the external WAL safekeeping service (or
-directly to the primary) using PostgreSQL physical streaming
-replication, and continuously receives WAL. It decodes the WAL records,
-and stores them to the repository.
-
-
-Repository
----------
-
-The repository stores all the page versions, or WAL records needed to
-reconstruct them. Each tenant has a separate Repository, which is
-stored in the .zenith/tenants/<tenantid> directory.
-
-Repository is an abstract trait, defined in `repository.rs`. It is
-implemented by the LayeredRepository object in
-`layered_repository.rs`. There is only that one implementation of the
-Repository trait, but it's still a useful abstraction that keeps the
-interface for the low-level storage functionality clean. The layered
-storage format is described in layered_repository/README.md.
-
-Each repository consists of multiple Timelines. Timeline is a
-workhorse that accepts page changes from the WAL, and serves
-get_page_at_lsn() and get_rel_size() requests. Note: this has nothing
-to do with PostgreSQL WAL timeline. The term "timeline" is mostly
-interchangeable with "branch", there is a one-to-one mapping from
-branch to timeline. A timeline has a unique ID within the tenant,
-represented as 16-byte hex string that never changes, whereas a
-branch is a user-given name for a timeline.
-
-Each repository also has a WAL redo manager associated with it, see
-`walredo.rs`. The WAL redo manager is used to replay PostgreSQL WAL
-records, whenever we need to reconstruct a page version from WAL to
-satisfy a GetPage@LSN request, or to avoid accumulating too much WAL
-for a page. The WAL redo manager uses a Postgres process running in
-special zenith wal-redo mode to do the actual WAL redo, and
-communicates with the process using a pipe.
-
-
-Checkpointing / Garbage Collection
----------------------------------
-
-Periodically, the checkpointer thread wakes up and performs housekeeping
-duties on the repository. It has two duties:
-
-### Checkpointing
-
-Flush WAL that has accumulated in memory to disk, so that the old WAL
-can be truncated away in the WAL safekeepers. Also, to free up memory
-for receiving new WAL. This process is called "checkpointing". It's
-similar to checkpointing in PostgreSQL or other DBMSs, but in the page
-server, checkpointing happens on a per-segment basis.
-
-### Garbage collection
-
-Remove old on-disk layer files that are no longer needed according to the
-PITR retention policy
-
-
-### Backup service
+## Backup service

 The backup service, responsible for storing pageserver recovery data externally.

@@ -129,8 +68,6 @@ There are the following implementations present:
 * local filesystem — to use in tests mainly
 * AWS S3           - to use in production

-Implementation details are covered in the [backup readme](./src/remote_storage/README.md) and corresponding Rust file docs, parameters documentation can be found at [settings docs](../docs/settings.md).
-
 The backup service is disabled by default and can be enabled to interact with a single remote storage.

 CLI examples:
@@ -159,6 +96,67 @@ prefix_in_bucket = '/test_prefix/'

 `AWS_SECRET_ACCESS_KEY` and `AWS_ACCESS_KEY_ID` env variables can be used to specify the S3 credentials if needed.

+
+## Repository background tasks
+
+The Repository also has a few different background threads and tokio tasks that perform
+background duties like dumping accumulated WAL data from memory to disk, reorganizing
+files for performance (compaction), and garbage collecting old files.
+
+
+Repository
+----------
+
+The repository stores all the page versions, or WAL records needed to
+reconstruct them. Each tenant has a separate Repository, which is
+stored in the .neon/tenants/<tenantid> directory.
+
+Repository is an abstract trait, defined in `repository.rs`. It is
+implemented by the LayeredRepository object in
+`layered_repository.rs`. There is only that one implementation of the
+Repository trait, but it's still a useful abstraction that keeps the
+interface for the low-level storage functionality clean. The layered
+storage format is described in [pageserver-storage.md](./pageserver-storage.md).
+
+Each repository consists of multiple Timelines. Timeline is a
+workhorse that accepts page changes from the WAL, and serves
+get_page_at_lsn() and get_rel_size() requests. Note: this has nothing
+to do with PostgreSQL WAL timeline. The term "timeline" is mostly
+interchangeable with "branch", there is a one-to-one mapping from
+branch to timeline. A timeline has a unique ID within the tenant,
+represented as 16-byte hex string that never changes, whereas a
+branch is a user-given name for a timeline.
+
+Each repository also has a WAL redo manager associated with it, see
+`walredo.rs`. The WAL redo manager is used to replay PostgreSQL WAL
+records, whenever we need to reconstruct a page version from WAL to
+satisfy a GetPage@LSN request, or to avoid accumulating too much WAL
+for a page. The WAL redo manager uses a Postgres process running in
+special Neon wal-redo mode to do the actual WAL redo, and
+communicates with the process using a pipe.
+
+
+Checkpointing / Garbage Collection
+----------------------------------
+
+Periodically, the checkpointer thread wakes up and performs housekeeping
+duties on the repository. It has two duties:
+
+### Checkpointing
+
+Flush WAL that has accumulated in memory to disk, so that the old WAL
+can be truncated away in the WAL safekeepers. Also, to free up memory
+for receiving new WAL. This process is called "checkpointing". It's
+similar to checkpointing in PostgreSQL or other DBMSs, but in the page
+server, checkpointing happens on a per-segment basis.
+
+### Garbage collection
+
+Remove old on-disk layer files that are no longer needed according to the
+PITR retention policy
+
+
+
 TODO: Sharding
 --------------------

--- a/pageserver/src/layered_repository/README.md
+++ b/pageserver/src/layered_repository/README.md
@@ -1,4 +1,4 @@
-# Overview
+# Pageserver storage

 The main responsibility of the Page Server is to process the incoming WAL, and
 reprocess it into a format that allows reasonably quick access to any page
@@ -123,7 +123,7 @@ The files are called "layer files". Each layer file covers a range of keys, and
 a range of LSNs (or a single LSN, in case of image layers). You can think of it
 as a rectangle in the two-dimensional key-LSN space. The layer files for each
 timeline are stored in the timeline's subdirectory under
-`.zenith/tenants/<tenantid>/timelines`.
+`.neon/tenants/<tenantid>/timelines`.

 There are two kind of layer files: images, and delta layers. An image file
 contains a snapshot of all keys at a particular LSN, whereas a delta file
@@ -178,7 +178,7 @@ version, and how branching and GC works is still valid.
 The full path of a delta file looks like this:

 ```
-    .zenith/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
+    .neon/tenants/941ddc8604413b88b3d208bddf90396c/timelines/4af489b06af8eed9e27a841775616962/rel_1663_13990_2609_0_10_000000000169C348_0000000001702000
 ```

 For simplicity, the examples below use a simplified notation for the
@@ -409,7 +409,7 @@ removed because there is no newer layer file for the table.

 Things get slightly more complicated with multiple branches. All of
 the above still holds, but in addition to recent files we must also
-retain older shapshot files that are still needed by child branches.
+retain older snapshot files that are still needed by child branches.
 For example, if child branch is created at LSN 150, and the 'customers'
 table is updated on the branch, you would have these files:

--- a/docs/pageserver-thread-mgmt.md
+++ b/docs/pageserver-thread-mgmt.md
@@ -0,0 +1,26 @@
+## Thread management
+
+Each thread in the system is tracked by the `thread_mgr` module. It
+maintains a registry of threads, and which tenant or timeline they are
+operating on. This is used for safe shutdown of a tenant, or the whole
+system.
+
+### Handling shutdown
+
+When a tenant or timeline is deleted, we need to shut down all threads
+operating on it, before deleting the data on disk. A thread registered
+in the thread registry can check if it has been requested to shut down,
+by calling `is_shutdown_requested()`. For async operations, there's also
+a `shutdown_watcher()` async task that can be used to wake up on shutdown.
+
+### Sync vs async
+
+The primary programming model in the page server is synchronous,
+blocking code. However, there are some places where async code is
+used. Be very careful when mixing sync and async code.
+
+Async is primarily used to wait for incoming data on network
+connections. For example, all WAL receivers have a shared thread pool,
+with one async Task for each connection. Once a piece of WAL has been
+received from the network, the thread calls the blocking functions in
+the Repository to process the WAL.
--- a/docs/pageserver-walredo.md
+++ b/docs/pageserver-walredo.md
@@ -0,0 +1,77 @@
+# WAL Redo
+
+To reconstruct a particular page version from an image of the page and
+some WAL records, the pageserver needs to replay the WAL records. This
+happens on-demand, when a GetPage@LSN request comes in, or as part of
+background jobs that reorganize data for faster access.
+
+It's important that data cannot leak from one tenant to another, and
+that a corrupt WAL record on one timeline doesn't affect other tenants
+or timelines.
+
+## Multi-tenant security
+
+If you have direct access to the WAL directory, or if you have
+superuser access to a running PostgreSQL server, it's easy to
+construct a malicious or corrupt WAL record that causes the WAL redo
+functions to crash, or to execute arbitrary code. That is not a
+security problem for PostgreSQL; if you have superuser access, you
+have full access to the system anyway.
+
+The Neon pageserver, however, is multi-tenant. It needs to execute WAL
+belonging to different tenants in the same system, and malicious WAL
+in one tenant must not affect other tenants.
+
+A separate WAL redo process is launched for each tenant, and the
+process uses the seccomp(2) system call to restrict its access to the
+bare minimum needed to replay WAL records. The process does not have
+access to the filesystem or network. It can only communicate with the
+parent pageserver process through a pipe.
+
+If an attacker creates a malicious WAL record and injects it into the
+WAL stream of a timeline, he can take control of the WAL redo process
+in the pageserver. However, the WAL redo process cannot access the
+rest of the system. And because there is a separate WAL redo process
+for each tenant, the hijacked WAL redo process can only see WAL and
+data belonging to the same tenant, which the attacker would have
+access to anyway.
+
+## WAL-redo process communication
+
+The WAL redo process runs the 'postgres' executable, launched with a
+Neon-specific command-line option to put it into WAL-redo process
+mode.  The pageserver controls the lifetime of the WAL redo processes,
+launching them as needed. If a tenant is detached from the pageserver,
+any WAL redo processes for that tenant are killed.
+
+The pageserver communicates with each WAL redo process over its
+stdin/stdout/stderr. It works in request-response model with a simple
+custom protocol, described in walredo.rs. To replay a set of WAL
+records for a page, the pageserver sends the "before" image of the
+page and the WAL records over 'stdin', followed by a command to
+perform the replay. The WAL redo process responds with an "after"
+image of the page.
+
+## Special handling of some records
+
+Some WAL record types are handled directly in the pageserver, by
+bespoke Rust code, and are not sent over to the WAL redo process.
+This includes SLRU-related WAL records, like commit records. SLRUs
+don't use the standard Postgres buffer manager, so dealing with them
+in the Neon WAL redo mode would require quite a few changes to
+Postgres code and special handling in the protocol anyway.
+
+Some record types that include a full-page-image (e.g. XLOG_FPI) are
+also handled specially when incoming WAL is processed already, and are
+stored as page images rather than WAL records.
+
+
+## Records that modify multiple pages
+
+Some Postgres WAL records modify multiple pages. Such WAL records are
+duplicated, so that a copy is stored for each affected page. This is
+somewhat wasteful, but because most WAL records only affect one page,
+the overhead is acceptable.
+
+The WAL redo always happens for one particular page. If the WAL record
+contains changes to other pages, they are ignored.
--- a/docs/pageserver.md
+++ b/docs/pageserver.md
@@ -0,0 +1,11 @@
+# Page server architecture
+
+The Page Server has a few different duties:
+
+- Respond to GetPage@LSN requests from the Compute Nodes
+- Receive WAL from WAL safekeeper, and store it
+- Upload data to S3 to make it durable, download files from S3 as needed
+
+S3 is the main fault-tolerant storage of all data, as there are no Page Server
+replicas. We use a separate fault-tolerant WAL service to reduce latency. It
+keeps track of WAL records which are not synced to S3 yet.
--- a/docs/rfcs/002-storage.md
+++ b/docs/rfcs/002-storage.md
@@ -77,7 +77,7 @@ Upon storage node restart recent WAL files are applied to appropriate pages and

 ### **Checkpointing**

-No such mechanism is needed. Or we may look at the storage node as at kind of continuous chekpointer.
+No such mechanism is needed. Or we may look at the storage node as at kind of continuous checkpointer.

 ### **Full page writes (torn page protection)**

--- a/docs/rfcs/cluster-size-limits.md
+++ b/docs/rfcs/cluster-size-limits.md
@@ -36,12 +36,12 @@ This is how the `LOGICAL_TIMELINE_SIZE` metric is implemented in the pageserver.
 Alternatively, we could count only relation data. As in pg_database_size().
 This approach is somewhat more user-friendly because it is the data that is really affected by the user.
 On the other hand, it puts us in a weaker position than other services, i.e., RDS.
-We will need to refactor the timeline_size counter or add another counter to implement it. 
+We will need to refactor the timeline_size counter or add another counter to implement it.

 Timeline size is updated during wal digestion. It is not versioned and is valid at the last_received_lsn moment.
 Then this size should be reported to compute node.

-`current_timeline_size` value is included in the walreceiver's custom feedback message: `ZenithFeedback.`
+`current_timeline_size` value is included in the walreceiver's custom feedback message: `ReplicationFeedback.`

 (PR about protocol changes https://github.com/zenithdb/zenith/pull/1037).

@@ -64,11 +64,11 @@ We should warn users if the limit is soon to be reached.
 ### **Reliability, failure modes and corner cases**

 1. `current_timeline_size` is valid at the last received and digested by pageserver lsn.
-    
+
    If pageserver lags behind compute node, `current_timeline_size` will lag too. This lag can be tuned using backpressure, but it is not expected to be 0 all the time.
-    
+
    So transactions that happen in this lsn range may cause limit overflow. Especially operations that generate (i.e., CREATE DATABASE) or free (i.e., TRUNCATE) a lot of data pages while generating a small amount of WAL. Are there other operations like this?
-    
+
    Currently, CREATE DATABASE operations are restricted in the console. So this is not an issue.


--- a/docs/safekeeper-protocol.md
+++ b/docs/safekeeper-protocol.md
--- a/docs/separation-compute-storage.md
+++ b/docs/separation-compute-storage.md
@@ -0,0 +1,8 @@
+# Separation of Compute and Storage
+
+TODO:
+
+- Read path
+- Write path
+- Durability model
+- API auth
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -15,7 +15,7 @@ listen_pg_addr = '127.0.0.1:64000'
 listen_http_addr = '127.0.0.1:9898'

 checkpoint_distance = '268435456' # in bytes
-checkpoint_period = '1 s'
+checkpoint_timeout = '10m'

 gc_period = '100 s'
 gc_horizon = '67108864'
@@ -46,7 +46,7 @@ Note the `[remote_storage]` section: it's a [table](https://toml.io/en/v1.0.0#ta

 All values can be passed as an argument to the pageserver binary, using the `-c` parameter and specified as a valid TOML string. All tables should be passed in the inline form.

-Example: `${PAGESERVER_BIN} -c "checkpoint_period = '100 s'" -c "remote_storage={local_path='/some/local/path/'}"`
+Example: `${PAGESERVER_BIN} -c "checkpoint_timeout = '10 m'" -c "remote_storage={local_path='/some/local/path/'}"`

 Note that TOML distinguishes between strings and integers, the former require single or double quotes around them.

@@ -82,6 +82,14 @@ S3.

 The unit is # of bytes.

+#### checkpoint_timeout
+
+Apart from `checkpoint_distance`, open layer flushing is also triggered
+`checkpoint_timeout` after the last flush. This ensures that WAL is eventually uploaded to
+S3 when activity stops.
+
+The default is 10m.
+
 #### compaction_period

 Every `compaction_period` seconds, the page server checks if
@@ -154,7 +162,7 @@ The default distrib dir is `./tmp_install/`.
 #### workdir (-D)

 A directory in the file system, where pageserver will store its files.
-The default is `./.zenith/`.
+The default is `./.neon/`.

 This parameter has a special CLI alias (`-D`) and can not be overridden with regular `-c` way.

--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -28,7 +28,7 @@ The pageserver has a few different duties:
 - Receive WAL from the WAL service and decode it.
 - Replay WAL that's applicable to the chunks that the Page Server maintains

-For more detailed info, see [/pageserver/README](/pageserver/README.md)
+For more detailed info, see [pageserver-services.md](./pageserver-services.md)

 `/proxy`:

@@ -57,7 +57,7 @@ PostgreSQL extension that contains functions needed for testing and debugging.
 The zenith WAL service that receives WAL from a primary compute nodes and streams it to the pageserver.
 It acts as a holding area and redistribution center for recently generated WAL.

-For more detailed info, see [/safekeeper/README](/safekeeper/README.md)
+For more detailed info, see [walservice.md](./walservice.md)

 `/workspace_hack`:
 The workspace_hack crate exists only to pin down some dependencies.
--- a/safekeeper/README.md
+++ b/safekeeper/README.md
@@ -75,8 +75,8 @@ safekeepers. The Paxos and crash recovery algorithm ensures that only
 one primary node can be actively streaming WAL to the quorum of
 safekeepers.

-See README_PROTO.md for a more detailed description of the consensus
-protocol. spec/ contains TLA+ specification of it.
+See [this section](safekeeper-protocol.md) for a more detailed description of
+the consensus protocol. spec/ contains TLA+ specification of it.

 # Q&A

--- a/libs/etcd_broker/Cargo.toml
+++ b/libs/etcd_broker/Cargo.toml
@@ -9,6 +9,7 @@
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_with = "1.12.0"
+ once_cell = "1.13.0"

 utils = { path = "../utils" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/etcd_broker/src/lib.rs
+++ b/libs/etcd_broker/src/lib.rs
@@ -1,350 +1,209 @@
 //! A set of primitives to access a shared data/updates, propagated via etcd broker (not persistent).
 //! Intended to connect services to each other, not to store their data.
-use std::{
-    collections::{hash_map, HashMap},
-    fmt::Display,
-    str::FromStr,
-};

-use regex::{Captures, Regex};
-use serde::{Deserialize, Serialize};
-use serde_with::{serde_as, DisplayFromStr};
+/// All broker keys, that are used when dealing with etcd.
+pub mod subscription_key;
+/// All broker values, possible to use when dealing with etcd.
+pub mod subscription_value;

-pub use etcd_client::*;
+use std::str::FromStr;

+use serde::de::DeserializeOwned;
+
+use subscription_key::SubscriptionKey;
 use tokio::{sync::mpsc, task::JoinHandle};
 use tracing::*;
-use utils::{
-    lsn::Lsn,
-    zid::{NodeId, ZTenantId, ZTenantTimelineId},
-};
+
+use crate::subscription_key::SubscriptionFullKey;
+
+pub use etcd_client::*;

 /// Default value to use for prefixing to all etcd keys with.
 /// This way allows isolating safekeeper/pageserver groups in the same etcd cluster.
 pub const DEFAULT_NEON_BROKER_ETCD_PREFIX: &str = "neon";

-#[derive(Debug, Deserialize, Serialize)]
-struct SafekeeperTimeline {
-    safekeeper_id: NodeId,
-    info: SkTimelineInfo,
+/// A way to control the data retrieval from a certain subscription.
+pub struct BrokerSubscription<V> {
+    /// An unbounded channel to fetch the relevant etcd updates from.
+    pub value_updates: mpsc::UnboundedReceiver<BrokerUpdate<V>>,
+    key: SubscriptionKey,
+    /// A subscription task handle, to allow waiting on it for the task to complete.
+    /// Both the updates channel and the handle require `&mut`, so it's better to keep
+    /// both `pub` to allow using both in the same structures without borrow checker complaining.
+    pub watcher_handle: JoinHandle<Result<(), BrokerError>>,
+    watcher: Watcher,
 }

-/// Published data about safekeeper's timeline. Fields made optional for easy migrations.
-#[serde_as]
-#[derive(Debug, Clone, Deserialize, Serialize)]
-pub struct SkTimelineInfo {
-    /// Term of the last entry.
-    pub last_log_term: Option<u64>,
-    /// LSN of the last record.
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(default)]
-    pub flush_lsn: Option<Lsn>,
-    /// Up to which LSN safekeeper regards its WAL as committed.
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(default)]
-    pub commit_lsn: Option<Lsn>,
-    /// LSN up to which safekeeper has backed WAL.
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(default)]
-    pub backup_lsn: Option<Lsn>,
-    /// LSN of last checkpoint uploaded by pageserver.
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(default)]
-    pub remote_consistent_lsn: Option<Lsn>,
-    #[serde_as(as = "Option<DisplayFromStr>")]
-    #[serde(default)]
-    pub peer_horizon_lsn: Option<Lsn>,
-    #[serde(default)]
-    pub safekeeper_connstr: Option<String>,
-    #[serde(default)]
-    pub pageserver_connstr: Option<String>,
+impl<V> BrokerSubscription<V> {
+    /// Cancels the subscription, stopping the data poller and waiting for it to shut down.
+    pub async fn cancel(mut self) -> Result<(), BrokerError> {
+        self.watcher.cancel().await.map_err(|e| {
+            BrokerError::EtcdClient(
+                e,
+                format!("Failed to cancel broker subscription, kind: {:?}", self.key),
+            )
+        })?;
+        match (&mut self.watcher_handle).await {
+            Ok(res) => res,
+            Err(e) => {
+                if e.is_cancelled() {
+                    // don't error on the tasks that are cancelled already
+                    Ok(())
+                } else {
+                    Err(BrokerError::InternalError(format!(
+                        "Panicked during broker subscription task, kind: {:?}, error: {e}",
+                        self.key
+                    )))
+                }
+            }
+        }
+    }
+}
+
+impl<V> Drop for BrokerSubscription<V> {
+    fn drop(&mut self) {
+        // we poll data from etcd into the channel in the same struct, so if the whole struct gets dropped,
+        // no more data is used by the receiver and it's safe to cancel and drop the whole etcd subscription task.
+        self.watcher_handle.abort();
+    }
+}
+
+/// An update from the etcd broker.
+pub struct BrokerUpdate<V> {
+    /// Etcd generation version: the bigger it is, the more up-to-date the data is.
+    pub etcd_version: i64,
+    /// Etcd key for the corresponding value, parsed from the broker KV.
+    pub key: SubscriptionFullKey,
+    /// Current etcd value, parsed from the broker KV.
+    pub value: V,
 }

 #[derive(Debug, thiserror::Error)]
 pub enum BrokerError {
    #[error("Etcd client error: {0}. Context: {1}")]
    EtcdClient(etcd_client::Error, String),
-    #[error("Error during parsing etcd data: {0}")]
-    ParsingError(String),
+    #[error("Error during parsing etcd key: {0}")]
+    KeyNotParsed(String),
    #[error("Internal error: {0}")]
    InternalError(String),
 }

-/// A way to control the data retrieval from a certain subscription.
-pub struct SkTimelineSubscription {
-    safekeeper_timeline_updates:
-        mpsc::UnboundedReceiver<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>>,
-    kind: SkTimelineSubscriptionKind,
-    watcher_handle: JoinHandle<Result<(), BrokerError>>,
-    watcher: Watcher,
-}
-
-impl SkTimelineSubscription {
-    /// Asynchronously polls for more data from the subscription, suspending the current future if there's no data sent yet.
-    pub async fn fetch_data(
-        &mut self,
-    ) -> Option<HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>>> {
-        self.safekeeper_timeline_updates.recv().await
-    }
-
-    /// Cancels the subscription, stopping the data poller and waiting for it to shut down.
-    pub async fn cancel(mut self) -> Result<(), BrokerError> {
-        self.watcher.cancel().await.map_err(|e| {
-            BrokerError::EtcdClient(
-                e,
-                format!(
-                    "Failed to cancel timeline subscription, kind: {:?}",
-                    self.kind
-                ),
-            )
-        })?;
-        self.watcher_handle.await.map_err(|e| {
-            BrokerError::InternalError(format!(
-                "Failed to join the timeline updates task, kind: {:?}, error: {e}",
-                self.kind
-            ))
-        })?
-    }
-}
-
-/// The subscription kind to the timeline updates from safekeeper.
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct SkTimelineSubscriptionKind {
-    broker_etcd_prefix: String,
-    kind: SubscriptionKind,
-}
-
-impl SkTimelineSubscriptionKind {
-    pub fn all(broker_etcd_prefix: String) -> Self {
-        Self {
-            broker_etcd_prefix,
-            kind: SubscriptionKind::All,
-        }
-    }
-
-    pub fn tenant(broker_etcd_prefix: String, tenant: ZTenantId) -> Self {
-        Self {
-            broker_etcd_prefix,
-            kind: SubscriptionKind::Tenant(tenant),
-        }
-    }
-
-    pub fn timeline(broker_etcd_prefix: String, timeline: ZTenantTimelineId) -> Self {
-        Self {
-            broker_etcd_prefix,
-            kind: SubscriptionKind::Timeline(timeline),
-        }
-    }
-
-    fn watch_regex(&self) -> Regex {
-        match self.kind {
-            SubscriptionKind::All => Regex::new(&format!(
-                r"^{}/([[:xdigit:]]+)/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
-                self.broker_etcd_prefix
-            ))
-            .expect("wrong regex for 'everything' subscription"),
-            SubscriptionKind::Tenant(tenant_id) => Regex::new(&format!(
-                r"^{}/{tenant_id}/([[:xdigit:]]+)/safekeeper/([[:digit:]])$",
-                self.broker_etcd_prefix
-            ))
-            .expect("wrong regex for 'tenant' subscription"),
-            SubscriptionKind::Timeline(ZTenantTimelineId {
-                tenant_id,
-                timeline_id,
-            }) => Regex::new(&format!(
-                r"^{}/{tenant_id}/{timeline_id}/safekeeper/([[:digit:]])$",
-                self.broker_etcd_prefix
-            ))
-            .expect("wrong regex for 'timeline' subscription"),
-        }
-    }
-
-    /// Etcd key to use for watching a certain timeline updates from safekeepers.
-    pub fn watch_key(&self) -> String {
-        match self.kind {
-            SubscriptionKind::All => self.broker_etcd_prefix.to_string(),
-            SubscriptionKind::Tenant(tenant_id) => {
-                format!("{}/{tenant_id}/safekeeper", self.broker_etcd_prefix)
-            }
-            SubscriptionKind::Timeline(ZTenantTimelineId {
-                tenant_id,
-                timeline_id,
-            }) => format!(
-                "{}/{tenant_id}/{timeline_id}/safekeeper",
-                self.broker_etcd_prefix
-            ),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-enum SubscriptionKind {
-    /// Get every timeline update.
-    All,
-    /// Get certain tenant timelines' updates.
-    Tenant(ZTenantId),
-    /// Get certain timeline updates.
-    Timeline(ZTenantTimelineId),
-}
-
 /// Creates a background task to poll etcd for timeline updates from safekeepers.
 /// Stops and returns `Err` on any error during etcd communication.
 /// Watches the key changes until either the watcher is cancelled via etcd or the subscription cancellation handle,
 /// exiting normally in such cases.
-pub async fn subscribe_to_safekeeper_timeline_updates(
+/// Etcd values are parsed as JSON strings into a type specified in the generic parameter.
+pub async fn subscribe_for_json_values<V>(
    client: &mut Client,
-    subscription: SkTimelineSubscriptionKind,
-) -> Result<SkTimelineSubscription, BrokerError> {
-    info!("Subscribing to timeline updates, subscription kind: {subscription:?}");
+    key: SubscriptionKey,
+) -> Result<BrokerSubscription<V>, BrokerError>
+where
+    V: DeserializeOwned + Send + 'static,
+{
+    subscribe_for_values(client, key, |_, value_str| {
+        match serde_json::from_str::<V>(value_str) {
+            Ok(value) => Some(value),
+            Err(e) => {
+                error!("Failed to parse value str '{value_str}': {e}");
+                None
+            }
+        }
+    })
+    .await
+}
+
+/// Same as [`subscribe_for_json_values`], but allows specifying a custom parser for an etcd value string.
+pub async fn subscribe_for_values<P, V>(
+    client: &mut Client,
+    key: SubscriptionKey,
+    value_parser: P,
+) -> Result<BrokerSubscription<V>, BrokerError>
+where
+    V: Send + 'static,
+    P: Fn(SubscriptionFullKey, &str) -> Option<V> + Send + 'static,
+{
+    info!("Subscribing to broker value updates, key: {key:?}");
+    let subscription_key = key.clone();

    let (watcher, mut stream) = client
-        .watch(
-            subscription.watch_key(),
-            Some(WatchOptions::new().with_prefix()),
-        )
+        .watch(key.watch_key(), Some(WatchOptions::new().with_prefix()))
        .await
        .map_err(|e| {
            BrokerError::EtcdClient(
                e,
-                format!("Failed to init the watch for subscription {subscription:?}"),
+                format!("Failed to init the watch for subscription {key:?}"),
            )
        })?;

-    let (timeline_updates_sender, safekeeper_timeline_updates) = mpsc::unbounded_channel();
-
-    let subscription_kind = subscription.kind;
-    let regex = subscription.watch_regex();
+    let (value_updates_sender, value_updates_receiver) = mpsc::unbounded_channel();
    let watcher_handle = tokio::spawn(async move {
        while let Some(resp) = stream.message().await.map_err(|e| BrokerError::InternalError(format!(
-            "Failed to get messages from the subscription stream, kind: {subscription_kind:?}, error: {e}"
+            "Failed to get messages from the subscription stream, kind: {:?}, error: {e}", key.kind
        )))? {
            if resp.canceled() {
                info!("Watch for timeline updates subscription was canceled, exiting");
                break;
            }

-            let mut timeline_updates: HashMap<ZTenantTimelineId, HashMap<NodeId, SkTimelineInfo>> = HashMap::new();
-            // Keep track that the timeline data updates from etcd arrive in the right order.
-            // https://etcd.io/docs/v3.5/learning/api_guarantees/#isolation-level-and-consistency-of-replicas
-            // > etcd does not ensure linearizability for watch operations. Users are expected to verify the revision of watch responses to ensure correct ordering.
-            let mut timeline_etcd_versions: HashMap<ZTenantTimelineId, i64> = HashMap::new();
-
-
            let events = resp.events();
            debug!("Processing {} events", events.len());

            for event in events {
                if EventType::Put == event.event_type() {
                    if let Some(new_etcd_kv) = event.kv() {
-                        let new_kv_version = new_etcd_kv.version();
-
-                        match parse_etcd_key_value(subscription_kind, &regex, new_etcd_kv) {
-                            Ok(Some((zttid, timeline))) => {
-                                match timeline_updates
-                                    .entry(zttid)
-                                    .or_default()
-                                    .entry(timeline.safekeeper_id)
-                                {
-                                    hash_map::Entry::Occupied(mut o) => {
-                                        let old_etcd_kv_version = timeline_etcd_versions.get(&zttid).copied().unwrap_or(i64::MIN);
-                                        if old_etcd_kv_version < new_kv_version {
-                                            o.insert(timeline.info);
-                                            timeline_etcd_versions.insert(zttid,new_kv_version);
-                                        }
-                                    }
-                                    hash_map::Entry::Vacant(v) => {
-                                        v.insert(timeline.info);
-                                        timeline_etcd_versions.insert(zttid,new_kv_version);
-                                    }
-                                }
-                            }
-                            Ok(None) => {}
-                            Err(e) => error!("Failed to parse timeline update: {e}"),
+                        match parse_etcd_kv(new_etcd_kv, &value_parser, &key.cluster_prefix) {
+                            Ok(Some((key, value))) => if let Err(e) = value_updates_sender.send(BrokerUpdate {
+                                etcd_version: new_etcd_kv.version(),
+                                key,
+                                value,
+                            }) {
+                                info!("Broker value updates for key {key:?} sender got dropped, exiting: {e}");
+                                break;
+                            },
+                            Ok(None) => debug!("Ignoring key {key:?} : no value was returned by the parser"),
+                            Err(BrokerError::KeyNotParsed(e)) => debug!("Unexpected key {key:?} for timeline update: {e}"),
+                            Err(e) => error!("Failed to represent etcd KV {new_etcd_kv:?}: {e}"),
                        };
                    }
                }
            }
-
-            if let Err(e) = timeline_updates_sender.send(timeline_updates) {
-                info!("Timeline updates sender got dropped, exiting: {e}");
-                break;
-            }
        }

        Ok(())
-    });
+    }.instrument(info_span!("etcd_broker")));

-    Ok(SkTimelineSubscription {
-        kind: subscription,
-        safekeeper_timeline_updates,
+    Ok(BrokerSubscription {
+        key: subscription_key,
+        value_updates: value_updates_receiver,
        watcher_handle,
        watcher,
    })
 }

-fn parse_etcd_key_value(
-    subscription_kind: SubscriptionKind,
-    regex: &Regex,
+fn parse_etcd_kv<P, V>(
    kv: &KeyValue,
-) -> Result<Option<(ZTenantTimelineId, SafekeeperTimeline)>, BrokerError> {
-    let caps = if let Some(caps) = regex.captures(kv.key_str().map_err(|e| {
-        BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as key str"))
-    })?) {
-        caps
-    } else {
-        return Ok(None);
-    };
-
-    let (zttid, safekeeper_id) = match subscription_kind {
-        SubscriptionKind::All => (
-            ZTenantTimelineId::new(
-                parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
-                parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?,
-            ),
-            NodeId(parse_capture(&caps, 3).map_err(BrokerError::ParsingError)?),
-        ),
-        SubscriptionKind::Tenant(tenant_id) => (
-            ZTenantTimelineId::new(
-                tenant_id,
-                parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?,
-            ),
-            NodeId(parse_capture(&caps, 2).map_err(BrokerError::ParsingError)?),
-        ),
-        SubscriptionKind::Timeline(zttid) => (
-            zttid,
-            NodeId(parse_capture(&caps, 1).map_err(BrokerError::ParsingError)?),
-        ),
-    };
-
-    let info_str = kv.value_str().map_err(|e| {
-        BrokerError::EtcdClient(e, format!("Failed to represent kv {kv:?} as value str"))
-    })?;
-    Ok(Some((
-        zttid,
-        SafekeeperTimeline {
-            safekeeper_id,
-            info: serde_json::from_str(info_str).map_err(|e| {
-                BrokerError::ParsingError(format!(
-                    "Failed to parse '{info_str}' as safekeeper timeline info: {e}"
-                ))
-            })?,
-        },
-    )))
-}
-
-fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
+    value_parser: &P,
+    cluster_prefix: &str,
+) -> Result<Option<(SubscriptionFullKey, V)>, BrokerError>
 where
-    T: FromStr,
-    <T as FromStr>::Err: Display,
+    P: Fn(SubscriptionFullKey, &str) -> Option<V>,
 {
-    let capture_match = caps
-        .get(index)
-        .ok_or_else(|| format!("Failed to get capture match at index {index}"))?
-        .as_str();
-    capture_match.parse().map_err(|e| {
-        format!(
-            "Failed to parse {} from {capture_match}: {e}",
-            std::any::type_name::<T>()
-        )
-    })
+    let key_str = kv.key_str().map_err(|e| {
+        BrokerError::EtcdClient(e, "Failed to extract key str out of etcd KV".to_string())
+    })?;
+    let value_str = kv.value_str().map_err(|e| {
+        BrokerError::EtcdClient(e, "Failed to extract value str out of etcd KV".to_string())
+    })?;
+
+    if !key_str.starts_with(cluster_prefix) {
+        return Err(BrokerError::KeyNotParsed(format!(
+            "KV has unexpected key '{key_str}' that does not start with cluster prefix {cluster_prefix}"
+        )));
+    }
+
+    let key = SubscriptionFullKey::from_str(&key_str[cluster_prefix.len()..]).map_err(|e| {
+        BrokerError::KeyNotParsed(format!("Failed to parse KV key '{key_str}': {e}"))
+    })?;
+
+    Ok(value_parser(key, value_str).map(|value| (key, value)))
 }
--- a/libs/etcd_broker/src/subscription_key.rs
+++ b/libs/etcd_broker/src/subscription_key.rs
@@ -0,0 +1,310 @@
+//! Etcd broker keys, used in the project and shared between instances.
+//! The keys are split into two categories:
+//!
+//! * [`SubscriptionFullKey`] full key format: `<cluster_prefix>/<tenant>/<timeline>/<node_kind>/<operation>/<node_id>`
+//! Always returned from etcd in this form, always starts with the user key provided.
+//!
+//! * [`SubscriptionKey`] user input key format: always partial, since it's unknown which `node_id`'s are available.
+//! Full key always starts with the user input one, due to etcd subscription properties.
+
+use std::{fmt::Display, str::FromStr};
+
+use once_cell::sync::Lazy;
+use regex::{Captures, Regex};
+use utils::zid::{NodeId, ZTenantId, ZTenantTimelineId};
+
+/// User-provided subscription key: a cluster prefix combined with the kind of updates to watch.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct SubscriptionKey {
+    /// Generic cluster prefix, allowing the same etcd instance to be used by multiple logical groups.
+    pub cluster_prefix: String,
+    /// The subscription kind.
+    pub kind: SubscriptionKind,
+}
+
+/// All currently possible key kinds of an etcd broker subscription.
+/// Etcd works in such a way that every key that starts with the given subscription key is considered
+/// matching and is returned as part of the subscription.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum SubscriptionKind {
+    /// Get every update in etcd.
+    All,
+    /// Get etcd updates for any timeline of a certain tenant, affected by any operation from any node kind.
+    TenantTimelines(ZTenantId),
+    /// Get etcd updates for a certain timeline of a tenant, affected by any operation from any node kind.
+    Timeline(ZTenantTimelineId),
+    /// Get etcd timeline updates, specific to a certain node kind.
+    Node(ZTenantTimelineId, NodeKind),
+    /// Get etcd timeline updates for a certain operation on specific nodes.
+    Operation(ZTenantTimelineId, NodeKind, OperationKind),
+}
+
+/// All kinds of nodes, able to write into etcd.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum NodeKind {
+    Safekeeper,
+    Pageserver,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum OperationKind {
+    Safekeeper(SkOperationKind),
+}
+
+/// Current operations, running inside the safekeeper node.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum SkOperationKind {
+    TimelineInfo,
+    WalBackup,
+}
+
+static SUBSCRIPTION_FULL_KEY_REGEX: Lazy<Regex> = Lazy::new(|| {
+    Regex::new("/([[:xdigit:]]+)/([[:xdigit:]]+)/([^/]+)/([^/]+)/([[:digit:]]+)$")
+        .expect("wrong subscription full etcd key regex")
+});
+
+/// Full key, received from etcd during any of the component's work.
+/// No other etcd keys are considered during system's work.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SubscriptionFullKey {
+    pub id: ZTenantTimelineId,
+    pub node_kind: NodeKind,
+    pub operation: OperationKind,
+    pub node_id: NodeId,
+}
+
+impl SubscriptionKey {
+    /// Subscribes for all etcd updates.
+    pub fn all(cluster_prefix: String) -> Self {
+        SubscriptionKey {
+            cluster_prefix,
+            kind: SubscriptionKind::All,
+        }
+    }
+
+    /// Subscribes to the timeline info updates from safekeepers for a given timeline.
+    pub fn sk_timeline_info(cluster_prefix: String, timeline: ZTenantTimelineId) -> Self {
+        Self {
+            cluster_prefix,
+            kind: SubscriptionKind::Operation(
+                timeline,
+                NodeKind::Safekeeper,
+                OperationKind::Safekeeper(SkOperationKind::TimelineInfo),
+            ),
+        }
+    }
+
+    /// Subscribes to all timeline updates during specific operations, running on the corresponding nodes.
+    pub fn operation(
+        cluster_prefix: String,
+        timeline: ZTenantTimelineId,
+        node_kind: NodeKind,
+        operation: OperationKind,
+    ) -> Self {
+        Self {
+            cluster_prefix,
+            kind: SubscriptionKind::Operation(timeline, node_kind, operation),
+        }
+    }
+
+    /// Etcd key to use for watching the updates that match this subscription kind.
+    pub fn watch_key(&self) -> String {
+        let cluster_prefix = &self.cluster_prefix;
+        match self.kind {
+            SubscriptionKind::All => cluster_prefix.to_string(),
+            SubscriptionKind::TenantTimelines(tenant_id) => {
+                format!("{cluster_prefix}/{tenant_id}")
+            }
+            SubscriptionKind::Timeline(id) => {
+                format!("{cluster_prefix}/{id}")
+            }
+            SubscriptionKind::Node(id, node_kind) => {
+                format!("{cluster_prefix}/{id}/{node_kind}")
+            }
+            SubscriptionKind::Operation(id, node_kind, operation_kind) => {
+                format!("{cluster_prefix}/{id}/{node_kind}/{operation_kind}")
+            }
+        }
+    }
+}
+
+impl Display for OperationKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            OperationKind::Safekeeper(o) => o.fmt(f),
+        }
+    }
+}
+
+impl FromStr for OperationKind {
+    type Err = String;
+
+    fn from_str(operation_kind_str: &str) -> Result<Self, Self::Err> {
+        match operation_kind_str {
+            "timeline_info" => Ok(OperationKind::Safekeeper(SkOperationKind::TimelineInfo)),
+            "wal_backup" => Ok(OperationKind::Safekeeper(SkOperationKind::WalBackup)),
+            _ => Err(format!("Unknown operation kind: {operation_kind_str}")),
+        }
+    }
+}
+
+impl Display for SubscriptionFullKey {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            id,
+            node_kind,
+            operation,
+            node_id,
+        } = self;
+        write!(f, "{id}/{node_kind}/{operation}/{node_id}")
+    }
+}
+
+impl FromStr for SubscriptionFullKey {
+    type Err = String;
+
+    fn from_str(subscription_kind_str: &str) -> Result<Self, Self::Err> {
+        let key_captures = match SUBSCRIPTION_FULL_KEY_REGEX.captures(subscription_kind_str) {
+            Some(captures) => captures,
+            None => {
+                return Err(format!(
+                    "Subscription kind str does not match a subscription full key regex {}",
+                    SUBSCRIPTION_FULL_KEY_REGEX.as_str()
+                ));
+            }
+        };
+
+        Ok(Self {
+            id: ZTenantTimelineId::new(
+                parse_capture(&key_captures, 1)?,
+                parse_capture(&key_captures, 2)?,
+            ),
+            node_kind: parse_capture(&key_captures, 3)?,
+            operation: parse_capture(&key_captures, 4)?,
+            node_id: NodeId(parse_capture(&key_captures, 5)?),
+        })
+    }
+}
+
+fn parse_capture<T>(caps: &Captures, index: usize) -> Result<T, String>
+where
+    T: FromStr,
+    <T as FromStr>::Err: Display,
+{
+    let capture_match = caps
+        .get(index)
+        .ok_or_else(|| format!("Failed to get capture match at index {index}"))?
+        .as_str();
+    capture_match.parse().map_err(|e| {
+        format!(
+            "Failed to parse {} from {capture_match}: {e}",
+            std::any::type_name::<T>()
+        )
+    })
+}
+
+impl Display for NodeKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Safekeeper => write!(f, "safekeeper"),
+            Self::Pageserver => write!(f, "pageserver"),
+        }
+    }
+}
+
+impl FromStr for NodeKind {
+    type Err = String;
+
+    fn from_str(node_kind_str: &str) -> Result<Self, Self::Err> {
+        match node_kind_str {
+            "safekeeper" => Ok(Self::Safekeeper),
+            "pageserver" => Ok(Self::Pageserver),
+            _ => Err(format!("Invalid node kind: {node_kind_str}")),
+        }
+    }
+}
+
+impl Display for SkOperationKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::TimelineInfo => write!(f, "timeline_info"),
+            Self::WalBackup => write!(f, "wal_backup"),
+        }
+    }
+}
+
+impl FromStr for SkOperationKind {
+    type Err = String;
+
+    fn from_str(operation_str: &str) -> Result<Self, Self::Err> {
+        match operation_str {
+            "timeline_info" => Ok(Self::TimelineInfo),
+            "wal_backup" => Ok(Self::WalBackup),
+            _ => Err(format!("Invalid operation: {operation_str}")),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use utils::zid::ZTimelineId;
+
+    use super::*;
+
+    #[test]
+    fn full_cluster_key_parsing() {
+        let prefix = "neon";
+        let node_kind = NodeKind::Safekeeper;
+        let operation_kind = OperationKind::Safekeeper(SkOperationKind::WalBackup);
+        let tenant_id = ZTenantId::generate();
+        let timeline_id = ZTimelineId::generate();
+        let id = ZTenantTimelineId::new(tenant_id, timeline_id);
+        let node_id = NodeId(1);
+
+        let timeline_subscription_keys = [
+            SubscriptionKey {
+                cluster_prefix: prefix.to_string(),
+                kind: SubscriptionKind::All,
+            },
+            SubscriptionKey {
+                cluster_prefix: prefix.to_string(),
+                kind: SubscriptionKind::TenantTimelines(tenant_id),
+            },
+            SubscriptionKey {
+                cluster_prefix: prefix.to_string(),
+                kind: SubscriptionKind::Timeline(id),
+            },
+            SubscriptionKey {
+                cluster_prefix: prefix.to_string(),
+                kind: SubscriptionKind::Node(id, node_kind),
+            },
+            SubscriptionKey {
+                cluster_prefix: prefix.to_string(),
+                kind: SubscriptionKind::Operation(id, node_kind, operation_kind),
+            },
+        ];
+
+        let full_key_string = format!(
+            "{}/{node_id}",
+            timeline_subscription_keys.last().unwrap().watch_key()
+        );
+
+        for key in timeline_subscription_keys {
+            assert!(full_key_string.starts_with(&key.watch_key()), "Full key '{full_key_string}' should start with any of the keys, keys, but {key:?} did not match");
+        }
+
+        let full_key = SubscriptionFullKey::from_str(&full_key_string).unwrap_or_else(|e| {
+            panic!("Failed to parse {full_key_string} as a subscription full key: {e}")
+        });
+
+        assert_eq!(
+            full_key,
+            SubscriptionFullKey {
+                id,
+                node_kind,
+                operation: operation_kind,
+                node_id
+            }
+        )
+    }
+}
--- a/libs/etcd_broker/src/subscription_value.rs
+++ b/libs/etcd_broker/src/subscription_value.rs
@@ -0,0 +1,35 @@
+//! Module for the values to put into etcd.
+
+use serde::{Deserialize, Serialize};
+use serde_with::{serde_as, DisplayFromStr};
+use utils::lsn::Lsn;
+
+/// Data about safekeeper's timeline. Fields made optional for easy migrations.
+#[serde_as]
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct SkTimelineInfo {
+    /// Term of the last entry.
+    pub last_log_term: Option<u64>,
+    /// LSN of the last record.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub flush_lsn: Option<Lsn>,
+    /// Up to which LSN safekeeper regards its WAL as committed.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub commit_lsn: Option<Lsn>,
+    /// LSN up to which safekeeper has backed WAL.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub backup_lsn: Option<Lsn>,
+    /// LSN of last checkpoint uploaded by pageserver.
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub remote_consistent_lsn: Option<Lsn>,
+    #[serde_as(as = "Option<DisplayFromStr>")]
+    #[serde(default)]
+    pub peer_horizon_lsn: Option<Lsn>,
+    /// A connection string to use for WAL receiving.
+    #[serde(default)]
+    pub safekeeper_connstr: Option<String>,
+}
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -6,6 +6,5 @@ edition = "2021"
 [dependencies]
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 libc = "0.2"
-lazy_static = "1.4"
-once_cell = "1.8.0"
+once_cell = "1.13.0"
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,7 +2,10 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
-use lazy_static::lazy_static;
+use once_cell::sync::Lazy;
+use prometheus::core::{AtomicU64, GenericGauge, GenericGaugeVec};
+pub use prometheus::opts;
+pub use prometheus::register;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_gauge, Gauge};
@@ -18,6 +21,17 @@ pub use prometheus::{Encoder, TextEncoder};
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};

+pub type UIntGauge = GenericGauge<AtomicU64>;
+pub type UIntGaugeVec = GenericGaugeVec<AtomicU64>;
+
+#[macro_export]
+macro_rules! register_uint_gauge_vec {
+    ($NAME:expr, $HELP:expr, $LABELS_NAMES:expr $(,)?) => {{
+        let gauge_vec = UIntGaugeVec::new($crate::opts!($NAME, $HELP), $LABELS_NAMES).unwrap();
+        $crate::register(Box::new(gauge_vec.clone())).map(|_| gauge_vec)
+    }};
+}
+
 /// Gathers all Prometheus metrics and records the I/O stats just before that.
 ///
 /// Metrics gathering is a relatively simple and standalone operation, so
@@ -27,19 +41,22 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    prometheus::gather()
 }

-lazy_static! {
-    static ref DISK_IO_BYTES: IntGaugeVec = register_int_gauge_vec!(
+static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
        "libmetrics_disk_io_bytes_total",
        "Bytes written and read from disk, grouped by the operation (read|write)",
        &["io_operation"]
    )
-    .expect("Failed to register disk i/o bytes int gauge vec");
-    static ref MAXRSS_KB: IntGauge = register_int_gauge!(
+    .expect("Failed to register disk i/o bytes int gauge vec")
+});
+
+static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
        "libmetrics_maxrss_kb",
        "Memory usage (Maximum Resident Set Size)"
    )
-    .expect("Failed to register maxrss_kb int gauge");
-}
+    .expect("Failed to register maxrss_kb int gauge")
+});

 pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
--- a/libs/metrics/src/wrappers.rs
+++ b/libs/metrics/src/wrappers.rs
@@ -10,13 +10,13 @@ use std::io::{Read, Result, Write};
 /// # use std::io::{Result, Read};
 /// # use metrics::{register_int_counter, IntCounter};
 /// # use metrics::CountedReader;
+/// # use once_cell::sync::Lazy;
 /// #
-/// # lazy_static::lazy_static! {
-/// #     static ref INT_COUNTER: IntCounter = register_int_counter!(
+/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
 /// #         "int_counter",
 /// #         "let's count something!"
-/// #     ).unwrap();
-/// # }
+/// #     ).unwrap()
+/// # });
 /// #
 /// fn do_some_reads(stream: impl Read, count: usize) -> Result<Vec<u8>> {
 ///     let mut reader = CountedReader::new(stream, |cnt| {
@@ -85,13 +85,13 @@ impl<T: Read> Read for CountedReader<'_, T> {
 /// # use std::io::{Result, Write};
 /// # use metrics::{register_int_counter, IntCounter};
 /// # use metrics::CountedWriter;
+/// # use once_cell::sync::Lazy;
 /// #
-/// # lazy_static::lazy_static! {
-/// #     static ref INT_COUNTER: IntCounter = register_int_counter!(
+/// # static INT_COUNTER: Lazy<IntCounter> = Lazy::new( || { register_int_counter!(
 /// #         "int_counter",
 /// #         "let's count something!"
-/// #     ).unwrap();
-/// # }
+/// #     ).unwrap()
+/// # });
 /// #
 /// fn do_some_writes(stream: impl Write, payload: &[u8]) -> Result<()> {
 ///     let mut writer = CountedWriter::new(stream, |cnt| {
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -12,7 +12,7 @@ byteorder = "1.4.3"
 anyhow = "1.0"
 crc32c = "0.6.0"
 hex = "0.4.3"
-lazy_static = "1.4"
+once_cell = "1.13.0"
 log = "0.4.14"
 memoffset = "0.6.2"
 thiserror = "1.0"
@@ -23,7 +23,7 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" }
 [dev-dependencies]
 env_logger = "0.9"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
-wal_generate = { path = "wal_generate" }
+wal_craft = { path = "wal_craft" }

 [build-dependencies]
 bindgen = "0.59.1"
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -2,6 +2,7 @@ extern crate bindgen;

 use std::env;
 use std::path::PathBuf;
+use std::process::Command;

 use bindgen::callbacks::ParseCallbacks;

@@ -45,6 +46,43 @@ fn main() {
    // Tell cargo to invalidate the built crate whenever the wrapper changes
    println!("cargo:rerun-if-changed=pg_control_ffi.h");

+    // Finding the location of C headers for the Postgres server:
+    // - if POSTGRES_INSTALL_DIR is set look into it, otherwise look into `<project_root>/tmp_install`
+    // - if the install dir has a `bin/pg_config` binary, ask it via `--includedir-server`, otherwise use `<install_dir>/include/postgresql/server`
+    let mut pg_install_dir = if let Some(postgres_install_dir) = env::var_os("POSTGRES_INSTALL_DIR")
+    {
+        postgres_install_dir.into()
+    } else {
+        PathBuf::from("tmp_install")
+    };
+
+    if pg_install_dir.is_relative() {
+        let cwd = env::current_dir().unwrap();
+        pg_install_dir = cwd.join("..").join("..").join(pg_install_dir);
+    }
+
+    let pg_config_bin = pg_install_dir.join("bin").join("pg_config");
+    let inc_server_path: String = if pg_config_bin.exists() {
+        let output = Command::new(pg_config_bin)
+            .arg("--includedir-server")
+            .output()
+            .expect("failed to execute `pg_config --includedir-server`");
+
+        if !output.status.success() {
+            panic!("`pg_config --includedir-server` failed")
+        }
+
+        String::from_utf8(output.stdout).unwrap().trim_end().into()
+    } else {
+        pg_install_dir
+            .join("include")
+            .join("postgresql")
+            .join("server")
+            .into_os_string()
+            .into_string()
+            .unwrap()
+    };
+
    // The bindgen::Builder is the main entry point
    // to bindgen, and lets you build up options for
    // the resulting bindings.
@@ -81,15 +119,7 @@ fn main() {
        // explicit padding fields.
        .explicit_padding(true)
        //
-        // Path the server include dir. It is in tmp_install/include/server, if you did
-        // "configure --prefix=<path to tmp_install>". But if you used "configure --prefix=/",
-        // and used DESTDIR to move it into tmp_install, then it's in
-        // tmp_install/include/postgres/server
-        // 'pg_config --includedir-server' would perhaps be the more proper way to find it,
-        // but this will do for now.
-        //
-        .clang_arg("-I../../tmp_install/include/server")
-        .clang_arg("-I../../tmp_install/include/postgresql/server")
+        .clang_arg(format!("-I{inc_server_path}"))
        //
        // Finish the builder and generate the bindings.
        //
--- a/libs/postgres_ffi/src/relfile_utils.rs
+++ b/libs/postgres_ffi/src/relfile_utils.rs
@@ -2,7 +2,7 @@
 //! Common utilities for dealing with PostgreSQL relation files.
 //!
 use crate::pg_constants;
-use lazy_static::lazy_static;
+use once_cell::sync::OnceCell;
 use regex::Regex;

 #[derive(Debug, Clone, thiserror::Error, PartialEq)]
@@ -54,11 +54,14 @@ pub fn forknumber_to_name(forknum: u8) -> Option<&'static str> {
 /// See functions relpath() and _mdfd_segpath() in PostgreSQL sources.
 ///
 pub fn parse_relfilename(fname: &str) -> Result<(u32, u8, u32), FilePathError> {
-    lazy_static! {
-        static ref RELFILE_RE: Regex =
-            Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap();
-    }
+    static RELFILE_RE: OnceCell<Regex> = OnceCell::new();
+    RELFILE_RE.get_or_init(|| {
+        Regex::new(r"^(?P<relnode>\d+)(_(?P<forkname>[a-z]+))?(\.(?P<segno>\d+))?$").unwrap()
+    });
+
    let caps = RELFILE_RE
+        .get()
+        .unwrap()
        .captures(fname)
        .ok_or(FilePathError::InvalidFileName)?;

--- a/libs/postgres_ffi/src/waldecoder.rs
+++ b/libs/postgres_ffi/src/waldecoder.rs
@@ -13,24 +13,30 @@ use super::xlog_utils::*;
 use super::XLogLongPageHeaderData;
 use super::XLogPageHeaderData;
 use super::XLogRecord;
+use super::XLOG_PAGE_MAGIC;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 use crc32c::*;
 use log::*;
 use std::cmp::min;
+use std::num::NonZeroU32;
 use thiserror::Error;
 use utils::lsn::Lsn;

+enum State {
+    WaitingForRecord,
+    ReassemblingRecord {
+        recordbuf: BytesMut,
+        contlen: NonZeroU32,
+    },
+    SkippingEverything {
+        skip_until_lsn: Lsn,
+    },
+}
+
 pub struct WalStreamDecoder {
    lsn: Lsn,
-
-    startlsn: Lsn, // LSN where this record starts
-    contlen: u32,
-    padlen: u32,
-
    inputbuf: BytesMut,
-
-    /// buffer used to reassemble records that cross page boundaries.
-    recordbuf: BytesMut,
+    state: State,
 }

 #[derive(Error, Debug, Clone)]
@@ -48,13 +54,8 @@ impl WalStreamDecoder {
    pub fn new(lsn: Lsn) -> WalStreamDecoder {
        WalStreamDecoder {
            lsn,
-
-            startlsn: Lsn(0),
-            contlen: 0,
-            padlen: 0,
-
            inputbuf: BytesMut::new(),
-            recordbuf: BytesMut::new(),
+            state: State::WaitingForRecord,
        }
    }

@@ -67,6 +68,58 @@ impl WalStreamDecoder {
        self.inputbuf.extend_from_slice(buf);
    }

+    fn validate_page_header(&self, hdr: &XLogPageHeaderData) -> Result<(), WalDecodeError> {
+        let validate_impl = || {
+            if hdr.xlp_magic != XLOG_PAGE_MAGIC as u16 {
+                return Err(format!(
+                    "invalid xlog page header: xlp_magic={}, expected {}",
+                    hdr.xlp_magic, XLOG_PAGE_MAGIC
+                ));
+            }
+            if hdr.xlp_pageaddr != self.lsn.0 {
+                return Err(format!(
+                    "invalid xlog page header: xlp_pageaddr={}, expected {}",
+                    hdr.xlp_pageaddr, self.lsn
+                ));
+            }
+            match self.state {
+                State::WaitingForRecord => {
+                    if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD != 0 {
+                        return Err(
+                            "invalid xlog page header: unexpected XLP_FIRST_IS_CONTRECORD".into(),
+                        );
+                    }
+                    if hdr.xlp_rem_len != 0 {
+                        return Err(format!(
+                            "invalid xlog page header: xlp_rem_len={}, but it's not a contrecord",
+                            hdr.xlp_rem_len
+                        ));
+                    }
+                }
+                State::ReassemblingRecord { contlen, .. } => {
+                    if hdr.xlp_info & XLP_FIRST_IS_CONTRECORD == 0 {
+                        return Err(
+                            "invalid xlog page header: XLP_FIRST_IS_CONTRECORD expected, not found"
+                                .into(),
+                        );
+                    }
+                    if hdr.xlp_rem_len != contlen.get() {
+                        return Err(format!(
+                            "invalid xlog page header: xlp_rem_len={}, expected {}",
+                            hdr.xlp_rem_len,
+                            contlen.get()
+                        ));
+                    }
+                }
+                State::SkippingEverything { .. } => {
+                    panic!("Should not be validating page header in the SkippingEverything state");
+                }
+            };
+            Ok(())
+        };
+        validate_impl().map_err(|msg| WalDecodeError { msg, lsn: self.lsn })
+    }
+
    /// Attempt to decode another WAL record from the input that has been fed to the
    /// decoder so far.
    ///
@@ -76,127 +129,121 @@ impl WalStreamDecoder {
    ///     Err(WalDecodeError): an error occurred while decoding, meaning the input was invalid.
    ///
    pub fn poll_decode(&mut self) -> Result<Option<(Lsn, Bytes)>, WalDecodeError> {
-        let recordbuf;
-
        // Run state machine that validates page headers, and reassembles records
        // that cross page boundaries.
        loop {
            // parse and verify page boundaries as we go
-            if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
-                // parse long header
+            // However, we may have to skip some page headers if we're processing the XLOG_SWITCH record or skipping padding for whatever reason.
+            match self.state {
+                State::WaitingForRecord | State::ReassemblingRecord { .. } => {
+                    if self.lsn.segment_offset(pg_constants::WAL_SEGMENT_SIZE) == 0 {
+                        // parse long header

-                if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
-                    return Ok(None);
-                }
+                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_LONG_PHD {
+                            return Ok(None);
+                        }

-                let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
-                    WalDecodeError {
-                        msg: format!("long header deserialization failed {}", e),
-                        lsn: self.lsn,
+                        let hdr = XLogLongPageHeaderData::from_bytes(&mut self.inputbuf).map_err(
+                            |e| WalDecodeError {
+                                msg: format!("long header deserialization failed {}", e),
+                                lsn: self.lsn,
+                            },
+                        )?;
+
+                        self.validate_page_header(&hdr.std)?;
+
+                        self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+                    } else if self.lsn.block_offset() == 0 {
+                        if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
+                            return Ok(None);
+                        }
+
+                        let hdr =
+                            XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
+                                WalDecodeError {
+                                    msg: format!("header deserialization failed {}", e),
+                                    lsn: self.lsn,
+                                }
+                            })?;
+
+                        self.validate_page_header(&hdr)?;
+
+                        self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
                    }
-                })?;
-
-                if hdr.std.xlp_pageaddr != self.lsn.0 {
-                    return Err(WalDecodeError {
-                        msg: "invalid xlog segment header".into(),
-                        lsn: self.lsn,
-                    });
                }
-                // TODO: verify the remaining fields in the header
-
-                self.lsn += XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
-                continue;
-            } else if self.lsn.block_offset() == 0 {
-                if self.inputbuf.remaining() < XLOG_SIZE_OF_XLOG_SHORT_PHD {
-                    return Ok(None);
-                }
-
-                let hdr = XLogPageHeaderData::from_bytes(&mut self.inputbuf).map_err(|e| {
-                    WalDecodeError {
-                        msg: format!("header deserialization failed {}", e),
-                        lsn: self.lsn,
+                State::SkippingEverything { .. } => {}
+            }
+            match &mut self.state {
+                State::WaitingForRecord => {
+                    // need to have at least the xl_tot_len field
+                    if self.inputbuf.remaining() < 4 {
+                        return Ok(None);
                    }
-                })?;

-                if hdr.xlp_pageaddr != self.lsn.0 {
-                    return Err(WalDecodeError {
-                        msg: "invalid xlog page header".into(),
-                        lsn: self.lsn,
-                    });
+                    // peek xl_tot_len at the beginning of the record.
+                    // FIXME: assumes little-endian
+                    let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
+                    if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
+                        return Err(WalDecodeError {
+                            msg: format!("invalid xl_tot_len {}", xl_tot_len),
+                            lsn: self.lsn,
+                        });
+                    }
+                    // Fast path for the common case that the whole record fits on the page.
+                    let pageleft = self.lsn.remaining_in_block() as u32;
+                    if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
+                        self.lsn += xl_tot_len as u64;
+                        let recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
+                        return Ok(Some(self.complete_record(recordbuf)?));
+                    } else {
+                        // Need to assemble the record from pieces. Remember the size of the
+                        // record, and loop back. On the next iteration, we will reach the
+                        // `ReassemblingRecord` arm below, and copy the part of the record that
+                        // was on this page to 'recordbuf'. Subsequent iterations will skip page
+                        // headers, and append continuations from the next pages to 'recordbuf'.
+                        self.state = State::ReassemblingRecord {
+                            recordbuf: BytesMut::with_capacity(xl_tot_len as usize),
+                            contlen: NonZeroU32::new(xl_tot_len).unwrap(),
+                        }
+                    }
                }
-                // TODO: verify the remaining fields in the header
+                State::ReassemblingRecord { recordbuf, contlen } => {
+                    // we're continuing a record, possibly from previous page.
+                    let pageleft = self.lsn.remaining_in_block() as u32;

-                self.lsn += XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
-                continue;
-            } else if self.padlen > 0 {
-                if self.inputbuf.remaining() < self.padlen as usize {
-                    return Ok(None);
+                    // read the rest of the record, or as much as fits on this page.
+                    let n = min(contlen.get(), pageleft) as usize;
+
+                    if self.inputbuf.remaining() < n {
+                        return Ok(None);
+                    }
+
+                    recordbuf.put(self.inputbuf.split_to(n));
+                    self.lsn += n as u64;
+                    *contlen = match NonZeroU32::new(contlen.get() - n as u32) {
+                        Some(x) => x,
+                        None => {
+                            // The record is now complete.
+                            let recordbuf = std::mem::replace(recordbuf, BytesMut::new()).freeze();
+                            return Ok(Some(self.complete_record(recordbuf)?));
+                        }
+                    }
                }
-
-                // skip padding
-                self.inputbuf.advance(self.padlen as usize);
-                self.lsn += self.padlen as u64;
-                self.padlen = 0;
-            } else if self.contlen == 0 {
-                assert!(self.recordbuf.is_empty());
-
-                // need to have at least the xl_tot_len field
-                if self.inputbuf.remaining() < 4 {
-                    return Ok(None);
+                State::SkippingEverything { skip_until_lsn } => {
+                    assert!(*skip_until_lsn >= self.lsn);
+                    let n = skip_until_lsn.0 - self.lsn.0;
+                    if self.inputbuf.remaining() < n as usize {
+                        return Ok(None);
+                    }
+                    self.inputbuf.advance(n as usize);
+                    self.lsn += n;
+                    self.state = State::WaitingForRecord;
                }
-
-                // peek xl_tot_len at the beginning of the record.
-                // FIXME: assumes little-endian
-                self.startlsn = self.lsn;
-                let xl_tot_len = (&self.inputbuf[0..4]).get_u32_le();
-                if (xl_tot_len as usize) < XLOG_SIZE_OF_XLOG_RECORD {
-                    return Err(WalDecodeError {
-                        msg: format!("invalid xl_tot_len {}", xl_tot_len),
-                        lsn: self.lsn,
-                    });
-                }
-
-                // Fast path for the common case that the whole record fits on the page.
-                let pageleft = self.lsn.remaining_in_block() as u32;
-                if self.inputbuf.remaining() >= xl_tot_len as usize && xl_tot_len <= pageleft {
-                    // Take the record from the 'inputbuf', and validate it.
-                    recordbuf = self.inputbuf.copy_to_bytes(xl_tot_len as usize);
-                    self.lsn += xl_tot_len as u64;
-                    break;
-                } else {
-                    // Need to assemble the record from pieces. Remember the size of the
-                    // record, and loop back. On next iteration, we will reach the 'else'
-                    // branch below, and copy the part of the record that was on this page
-                    // to 'recordbuf'.  Subsequent iterations will skip page headers, and
-                    // append the continuations from the next pages to 'recordbuf'.
-                    self.recordbuf.reserve(xl_tot_len as usize);
-                    self.contlen = xl_tot_len;
-                    continue;
-                }
-            } else {
-                // we're continuing a record, possibly from previous page.
-                let pageleft = self.lsn.remaining_in_block() as u32;
-
-                // read the rest of the record, or as much as fits on this page.
-                let n = min(self.contlen, pageleft) as usize;
-
-                if self.inputbuf.remaining() < n {
-                    return Ok(None);
-                }
-
-                self.recordbuf.put(self.inputbuf.split_to(n));
-                self.lsn += n as u64;
-                self.contlen -= n as u32;
-
-                if self.contlen == 0 {
-                    // The record is now complete.
-                    recordbuf = std::mem::replace(&mut self.recordbuf, BytesMut::new()).freeze();
-                    break;
-                }
-                continue;
            }
        }
+    }

+    fn complete_record(&mut self, recordbuf: Bytes) -> Result<(Lsn, Bytes), WalDecodeError> {
        // We now have a record in the 'recordbuf' local variable.
        let xlogrec =
            XLogRecord::from_slice(&recordbuf[0..XLOG_SIZE_OF_XLOG_RECORD]).map_err(|e| {
@@ -218,18 +265,20 @@ impl WalStreamDecoder {

        // XLOG_SWITCH records are special. If we see one, we need to skip
        // to the next WAL segment.
-        if xlogrec.is_xlog_switch_record() {
+        let next_lsn = if xlogrec.is_xlog_switch_record() {
            trace!("saw xlog switch record at {}", self.lsn);
-            self.padlen = self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64) as u32;
+            self.lsn + self.lsn.calc_padding(pg_constants::WAL_SEGMENT_SIZE as u64)
        } else {
            // Pad to an 8-byte boundary
-            self.padlen = self.lsn.calc_padding(8u32) as u32;
-        }
+            self.lsn.align()
+        };
+        self.state = State::SkippingEverything {
+            skip_until_lsn: next_lsn,
+        };

-        // Always align resulting LSN on 0x8 boundary -- that is important for getPage()
-        // and WalReceiver integration. Since this code is used both for WalReceiver and
-        // initial WAL import let's force alignment right here.
-        let result = (self.lsn.align(), recordbuf);
-        Ok(Some(result))
+        // We should return LSN of the next record, not the last byte of this record or
+        // the byte immediately after. Note that this handles both XLOG_SWITCH and usual
+        // records, the former "spans" until the next WAL segment (see test_xlog_switch).
+        Ok((next_lsn, recordbuf))
    }
 }
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -15,7 +15,8 @@ use crate::XLogPageHeaderData;
 use crate::XLogRecord;
 use crate::XLOG_PAGE_MAGIC;

-use anyhow::{bail, ensure};
+use crate::pg_constants::WAL_SEGMENT_SIZE;
+use anyhow::{anyhow, bail, ensure};
 use byteorder::{ByteOrder, LittleEndian};
 use bytes::BytesMut;
 use bytes::{Buf, Bytes};
@@ -158,7 +159,7 @@ fn find_end_of_wal_segment(
    let mut buf = [0u8; XLOG_BLCKSZ];
    let file_name = XLogFileName(tli, segno, wal_seg_size);
    let mut last_valid_rec_pos: usize = start_offset; // assume at given start_offset begins new record
-    let mut file = File::open(data_dir.join(file_name.clone() + ".partial")).unwrap();
+    let mut file = File::open(data_dir.join(file_name.clone() + ".partial"))?;
    file.seek(SeekFrom::Start(offs as u64))?;
    // xl_crc is the last field in XLogRecord, will not be read into rec_hdr
    const_assert!(XLOG_RECORD_CRC_OFFS + 4 == XLOG_SIZE_OF_XLOG_RECORD);
@@ -395,10 +396,13 @@ pub fn find_end_of_wal(
    let mut high_tli: TimeLineID = 0;
    let mut high_ispartial = false;

-    for entry in fs::read_dir(data_dir).unwrap().flatten() {
+    for entry in fs::read_dir(data_dir)?.flatten() {
        let ispartial: bool;
        let entry_name = entry.file_name();
-        let fname = entry_name.to_str().unwrap();
+        let fname = entry_name
+            .to_str()
+            .ok_or_else(|| anyhow!("Invalid file name"))?;
+
        /*
         * Check if the filename looks like an xlog file, or a .partial file.
         */
@@ -410,7 +414,7 @@ pub fn find_end_of_wal(
            continue;
        }
        let (segno, tli) = XLogFromFileName(fname, wal_seg_size);
-        if !ispartial && entry.metadata().unwrap().len() != wal_seg_size as u64 {
+        if !ispartial && entry.metadata()?.len() != wal_seg_size as u64 {
            continue;
        }
        if segno > high_segno
@@ -461,8 +465,7 @@ pub fn find_end_of_wal(
 pub fn main() {
    let mut data_dir = PathBuf::new();
    data_dir.push(".");
-    let wal_seg_size = 16 * 1024 * 1024;
-    let (wal_end, tli) = find_end_of_wal(&data_dir, wal_seg_size, true, Lsn(0)).unwrap();
+    let (wal_end, tli) = find_end_of_wal(&data_dir, WAL_SEGMENT_SIZE, true, Lsn(0)).unwrap();
    println!(
        "wal_end={:>08X}{:>08X}, tli={}",
        (wal_end >> 32) as u32,
@@ -597,20 +600,18 @@ mod tests {
    fn init_logging() {
        let _ = env_logger::Builder::from_env(
            env_logger::Env::default()
-                .default_filter_or("wal_generate=info,postgres_ffi::xlog_utils=trace"),
+                .default_filter_or("wal_craft=info,postgres_ffi::xlog_utils=trace"),
        )
        .is_test(true)
        .try_init();
    }

-    fn test_end_of_wal(
+    fn test_end_of_wal<C: wal_craft::Crafter>(
        test_name: &str,
-        generate_wal: impl Fn(&mut postgres::Client) -> anyhow::Result<postgres::types::PgLsn>,
        expected_end_of_wal_non_partial: Lsn,
-        last_segment: &str,
    ) {
-        use wal_generate::*;
-        // 1. Generate some WAL
+        use wal_craft::*;
+        // Craft some WAL
        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("..")
            .join("..");
@@ -622,25 +623,72 @@ mod tests {
            fs::remove_dir_all(&cfg.datadir).unwrap();
        }
        cfg.initdb().unwrap();
-        let mut srv = cfg.start_server().unwrap();
-        let expected_wal_end: Lsn =
-            u64::from(generate_wal(&mut srv.connect_with_timeout().unwrap()).unwrap()).into();
+        let srv = cfg.start_server().unwrap();
+        let (intermediate_lsns, expected_end_of_wal_partial) =
+            C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+        let intermediate_lsns: Vec<Lsn> = intermediate_lsns
+            .iter()
+            .map(|&lsn| u64::from(lsn).into())
+            .collect();
+        let expected_end_of_wal_partial: Lsn = u64::from(expected_end_of_wal_partial).into();
        srv.kill();

-        // 2. Pick WAL generated by initdb
-        let wal_dir = cfg.datadir.join("pg_wal");
-        let wal_seg_size = 16 * 1024 * 1024;
+        // Check find_end_of_wal on the initial WAL
+        let last_segment = cfg
+            .wal_dir()
+            .read_dir()
+            .unwrap()
+            .map(|f| f.unwrap().file_name().into_string().unwrap())
+            .filter(|fname| IsXLogFileName(fname))
+            .max()
+            .unwrap();
+        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal_partial);
+        for start_lsn in std::iter::once(Lsn(0))
+            .chain(intermediate_lsns)
+            .chain(std::iter::once(expected_end_of_wal_partial))
+        {
+            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
+            // We assume that `start_lsn` is non-decreasing.
+            info!(
+                "Checking with start_lsn={}, erasing WAL before it",
+                start_lsn
+            );
+            for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
+                let fname = file.file_name().into_string().unwrap();
+                if !IsXLogFileName(&fname) {
+                    continue;
+                }
+                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+                if seg_start_lsn > u64::from(start_lsn) {
+                    continue;
+                }
+                let mut f = File::options().write(true).open(file.path()).unwrap();
+                const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
+                f.write_all(
+                    &ZEROS[0..min(
+                        WAL_SEGMENT_SIZE,
+                        (u64::from(start_lsn) - seg_start_lsn) as usize,
+                    )],
+                )
+                .unwrap();
+            }
+            check_end_of_wal(
+                &cfg,
+                &last_segment,
+                start_lsn,
+                expected_end_of_wal_non_partial,
+                expected_end_of_wal_partial,
+            );
+        }
+    }

-        // 3. Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-        let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
-        let wal_end = Lsn(wal_end);
-        info!(
-            "find_end_of_wal returned (wal_end={}, tli={})",
-            wal_end, tli
-        );
-        assert_eq!(wal_end, expected_end_of_wal_non_partial);
-
-        // 4. Get the actual end of WAL by pg_waldump
+    fn check_pg_waldump_end_of_wal(
+        cfg: &wal_craft::Conf,
+        last_segment: &str,
+        expected_end_of_wal: Lsn,
+    ) {
+        // Get the actual end of WAL by pg_waldump
        let waldump_output = cfg
            .pg_waldump("000000010000000000000001", last_segment)
            .unwrap()
@@ -659,44 +707,66 @@ mod tests {
        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
        info!(
            "waldump erred on {}, expected wal end at {}",
-            waldump_wal_end, expected_wal_end
+            waldump_wal_end, expected_end_of_wal
        );
-        assert_eq!(waldump_wal_end, expected_wal_end);
+        assert_eq!(waldump_wal_end, expected_end_of_wal);
+    }

-        // 5. Rename file to partial to actually find last valid lsn
-        fs::rename(
-            wal_dir.join(last_segment),
-            wal_dir.join(format!("{}.partial", last_segment)),
-        )
-        .unwrap();
-        let (wal_end, tli) = find_end_of_wal(&wal_dir, wal_seg_size, true, Lsn(0)).unwrap();
+    fn check_end_of_wal(
+        cfg: &wal_craft::Conf,
+        last_segment: &str,
+        start_lsn: Lsn,
+        expected_end_of_wal_non_partial: Lsn,
+        expected_end_of_wal_partial: Lsn,
+    ) {
+        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
+        let (wal_end, tli) =
+            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
        let wal_end = Lsn(wal_end);
        info!(
-            "find_end_of_wal returned (wal_end={}, tli={})",
+            "find_end_of_wal returned (wal_end={}, tli={}) with non-partial WAL segment",
            wal_end, tli
        );
-        assert_eq!(wal_end, waldump_wal_end);
+        assert_eq!(wal_end, expected_end_of_wal_non_partial);
+
+        // Rename file to partial to actually find last valid lsn, then rename it back.
+        fs::rename(
+            cfg.wal_dir().join(&last_segment),
+            cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        )
+        .unwrap();
+        let (wal_end, tli) =
+            find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, true, start_lsn).unwrap();
+        let wal_end = Lsn(wal_end);
+        info!(
+            "find_end_of_wal returned (wal_end={}, tli={}) with partial WAL segment",
+            wal_end, tli
+        );
+        assert_eq!(wal_end, expected_end_of_wal_partial);
+        fs::rename(
+            cfg.wal_dir().join(format!("{}.partial", last_segment)),
+            cfg.wal_dir().join(last_segment),
+        )
+        .unwrap();
    }

+    const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
+
    #[test]
    pub fn test_find_end_of_wal_simple() {
        init_logging();
-        test_end_of_wal(
+        test_end_of_wal::<wal_craft::Simple>(
            "test_find_end_of_wal_simple",
-            wal_generate::generate_simple,
            "0/2000000".parse::<Lsn>().unwrap(),
-            "000000010000000000000001",
        );
    }

    #[test]
    pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
        init_logging();
-        test_end_of_wal(
+        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-            wal_generate::generate_wal_record_crossing_segment_followed_by_small_one,
            "0/3000000".parse::<Lsn>().unwrap(),
-            "000000010000000000000002",
        );
    }

@@ -704,11 +774,9 @@ mod tests {
    #[ignore = "not yet fixed, needs correct parsing of pre-last segments"] // TODO
    pub fn test_find_end_of_wal_last_crossing_segment() {
        init_logging();
-        test_end_of_wal(
+        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
            "test_find_end_of_wal_last_crossing_segment",
-            wal_generate::generate_last_wal_record_crossing_segment,
            "0/3000000".parse::<Lsn>().unwrap(),
-            "000000010000000000000002",
        );
    }

--- a/libs/postgres_ffi/wal_generate/Cargo.toml
+++ b/libs/postgres_ffi/wal_generate/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "wal_generate"
+name = "wal_craft"
 version = "0.1.0"
 edition = "2021"

@@ -10,5 +10,7 @@ anyhow = "1.0"
 clap = "3.0"
 env_logger = "0.9"
 log = "0.4"
+once_cell = "1.13.0"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
+postgres_ffi = { path = "../" }
 tempfile = "3.2"
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -0,0 +1,103 @@
+use anyhow::*;
+use clap::{App, Arg, ArgMatches};
+use std::str::FromStr;
+use wal_craft::*;
+
+fn main() -> Result<()> {
+    env_logger::Builder::from_env(env_logger::Env::default().default_filter_or("wal_craft=info"))
+        .init();
+    let type_arg = &Arg::new("type")
+        .takes_value(true)
+        .help("Type of WAL to craft")
+        .possible_values([
+            Simple::NAME,
+            LastWalRecordXlogSwitch::NAME,
+            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME,
+            WalRecordCrossingSegmentFollowedBySmallOne::NAME,
+            LastWalRecordCrossingSegment::NAME,
+        ])
+        .required(true);
+    let arg_matches = App::new("Postgres WAL crafter")
+        .about("Crafts Postgres databases with specific WAL properties")
+        .subcommand(
+            App::new("print-postgres-config")
+                .about("Print the configuration required for PostgreSQL server before running this script")
+        )
+        .subcommand(
+            App::new("with-initdb")
+                .about("Craft WAL in a new data directory first initialized with initdb")
+                .arg(type_arg)
+                .arg(
+                    Arg::new("datadir")
+                        .takes_value(true)
+                        .help("Data directory for the Postgres server")
+                        .required(true)
+                )
+                .arg(
+                    Arg::new("pg-distrib-dir")
+                        .long("pg-distrib-dir")
+                        .takes_value(true)
+                        .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
+                        .default_value("/usr/local")
+                )
+        )
+        .subcommand(
+            App::new("in-existing")
+                .about("Craft WAL at an existing recently created Postgres database. Note that server may append new WAL entries on shutdown.")
+                .arg(type_arg)
+                .arg(
+                    Arg::new("connection")
+                        .takes_value(true)
+                        .help("Connection string to the Postgres database to populate")
+                        .required(true)
+                )
+        )
+        .get_matches();
+
+    let wal_craft = |arg_matches: &ArgMatches, client| {
+        let (intermediate_lsns, end_of_wal_lsn) = match arg_matches.value_of("type").unwrap() {
+            Simple::NAME => Simple::craft(client)?,
+            LastWalRecordXlogSwitch::NAME => LastWalRecordXlogSwitch::craft(client)?,
+            LastWalRecordXlogSwitchEndsOnPageBoundary::NAME => {
+                LastWalRecordXlogSwitchEndsOnPageBoundary::craft(client)?
+            }
+            WalRecordCrossingSegmentFollowedBySmallOne::NAME => {
+                WalRecordCrossingSegmentFollowedBySmallOne::craft(client)?
+            }
+            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
+            a => panic!("Unknown --type argument: {}", a),
+        };
+        for lsn in intermediate_lsns {
+            println!("intermediate_lsn = {}", lsn);
+        }
+        println!("end_of_wal = {}", end_of_wal_lsn);
+        Ok(())
+    };
+
+    match arg_matches.subcommand() {
+        None => panic!("No subcommand provided"),
+        Some(("print-postgres-config", _)) => {
+            for cfg in REQUIRED_POSTGRES_CONFIG.iter() {
+                println!("{}", cfg);
+            }
+            Ok(())
+        }
+        Some(("with-initdb", arg_matches)) => {
+            let cfg = Conf {
+                pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
+                datadir: arg_matches.value_of("datadir").unwrap().into(),
+            };
+            cfg.initdb()?;
+            let srv = cfg.start_server()?;
+            wal_craft(arg_matches, &mut srv.connect_with_timeout()?)?;
+            srv.kill();
+            Ok(())
+        }
+        Some(("in-existing", arg_matches)) => wal_craft(
+            arg_matches,
+            &mut postgres::Config::from_str(arg_matches.value_of("connection").unwrap())?
+                .connect(postgres::NoTls)?,
+        ),
+        Some(_) => panic!("Unknown subcommand"),
+    }
+}
--- a/libs/postgres_ffi/wal_generate/src/lib.rs
+++ b/libs/postgres_ffi/wal_generate/src/lib.rs
@@ -1,9 +1,15 @@
 use anyhow::*;
 use core::time::Duration;
 use log::*;
+use once_cell::sync::Lazy;
 use postgres::types::PgLsn;
 use postgres::Client;
+use postgres_ffi::pg_constants::WAL_SEGMENT_SIZE;
+use postgres_ffi::xlog_utils::{
+    XLOG_BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::cmp::Ordering;
+use std::fs;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use std::time::Instant;
@@ -21,6 +27,16 @@ pub struct PostgresServer {
    client_config: postgres::Config,
 }

+pub static REQUIRED_POSTGRES_CONFIG: Lazy<Vec<&'static str>> = Lazy::new(|| {
+    vec![
+        "wal_keep_size=50MB",            // Ensure old WAL is not removed
+        "shared_preload_libraries=neon", // can only be loaded at startup
+        // Disable background processes as much as possible
+        "wal_writer_delay=10s",
+        "autovacuum=off",
+    ]
+});
+
 impl Conf {
    fn pg_bin_dir(&self) -> PathBuf {
        self.pg_distrib_dir.join("bin")
@@ -30,6 +46,10 @@ impl Conf {
        self.pg_distrib_dir.join("lib")
    }

+    pub fn wal_dir(&self) -> PathBuf {
+        self.datadir.join("pg_wal")
+    }
+
    fn new_pg_command(&self, command: impl AsRef<Path>) -> Result<Command> {
        let path = self.pg_bin_dir().join(command);
        ensure!(path.exists(), "Command {:?} does not exist", path);
@@ -69,6 +89,12 @@ impl Conf {

    pub fn start_server(&self) -> Result<PostgresServer> {
        info!("Starting Postgres server in {:?}", self.datadir);
+        let log_file = fs::File::create(self.datadir.join("pg.log")).with_context(|| {
+            format!(
+                "Failed to create pg.log file in directory {}",
+                self.datadir.display()
+            )
+        })?;
        let unix_socket_dir = tempdir()?; // We need a directory with a short name for Unix socket (up to 108 symbols)
        let unix_socket_dir_path = unix_socket_dir.path().to_owned();
        let server_process = self
@@ -78,13 +104,9 @@ impl Conf {
            .arg(unix_socket_dir_path.as_os_str())
            .arg("-D")
            .arg(self.datadir.as_os_str())
-            .args(&["-c", "wal_keep_size=50MB"]) // Ensure old WAL is not removed
            .args(&["-c", "logging_collector=on"]) // stderr will mess up with tests output
-            .args(&["-c", "shared_preload_libraries=neon"]) // can only be loaded at startup
-            // Disable background processes as much as possible
-            .args(&["-c", "wal_writer_delay=10s"])
-            .args(&["-c", "autovacuum=off"])
-            .stderr(Stdio::null())
+            .args(REQUIRED_POSTGRES_CONFIG.iter().flat_map(|cfg| ["-c", cfg]))
+            .stderr(Stdio::from(log_file))
            .spawn()?;
        let server = PostgresServer {
            process: server_process,
@@ -137,7 +159,7 @@ impl PostgresServer {
        bail!("Connection timed out");
    }

-    pub fn kill(&mut self) {
+    pub fn kill(mut self) {
        self.process.kill().unwrap();
        self.process.wait().unwrap();
    }
@@ -174,12 +196,16 @@ pub trait PostgresClientExt: postgres::GenericClient {

 impl<C: postgres::GenericClient> PostgresClientExt for C {}

-fn generate_internal<C: postgres::GenericClient>(
-    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> Result<Option<PgLsn>>,
-) -> Result<PgLsn> {
+pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> Result<()> {
    client.execute("create extension if not exists neon_test_utils", &[])?;

+    let wal_keep_size: String = client.query_one("SHOW wal_keep_size", &[])?.get(0);
+    ensure!(wal_keep_size == "50MB");
+    let wal_writer_delay: String = client.query_one("SHOW wal_writer_delay", &[])?.get(0);
+    ensure!(wal_writer_delay == "10s");
+    let autovacuum: String = client.query_one("SHOW autovacuum", &[])?.get(0);
+    ensure!(autovacuum == "off");
+
    let wal_segment_size = client.query_one(
        "select cast(setting as bigint) as setting, unit \
         from pg_settings where name = 'wal_segment_size'",
@@ -190,44 +216,160 @@ fn generate_internal<C: postgres::GenericClient>(
        "Unexpected wal_segment_size unit"
    );
    ensure!(
-        wal_segment_size.get::<_, i64>("setting") == 16 * 1024 * 1024,
+        wal_segment_size.get::<_, i64>("setting") == WAL_SEGMENT_SIZE as i64,
        "Unexpected wal_segment_size in bytes"
    );

+    Ok(())
+}
+
+pub trait Crafter {
+    const NAME: &'static str;
+
+    /// Generates WAL using the client `client`. Returns a pair of:
+    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
+    ///   May include or exclude Lsn(0) and the end-of-wal.
+    /// * The expected end-of-wal LSN.
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)>;
+}
+
+fn craft_internal<C: postgres::GenericClient>(
+    client: &mut C,
+    f: impl Fn(&mut C, PgLsn) -> Result<(Vec<PgLsn>, Option<PgLsn>)>,
+) -> Result<(Vec<PgLsn>, PgLsn)> {
+    ensure_server_config(client)?;
+
    let initial_lsn = client.pg_current_wal_insert_lsn()?;
    info!("LSN initial = {}", initial_lsn);

-    let last_lsn = match f(client, initial_lsn)? {
+    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
+    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
        Some(last_lsn) => match last_lsn.cmp(&client.pg_current_wal_insert_lsn()?) {
-            Ordering::Less => bail!("Some records were inserted after the generated WAL"),
+            Ordering::Less => bail!("Some records were inserted after the crafted WAL"),
            Ordering::Equal => last_lsn,
            Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
        },
    };
+    if !intermediate_lsns.starts_with(&[initial_lsn]) {
+        intermediate_lsns.insert(0, initial_lsn);
+    }

    // Some records may be not flushed, e.g. non-transactional logical messages.
    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
    match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
-        Ordering::Less => bail!("Some records were flushed after the generated WAL"),
+        Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
        Ordering::Equal => {}
        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
    }
-    Ok(last_lsn)
+    Ok((intermediate_lsns, last_lsn))
 }

-pub fn generate_simple(client: &mut impl postgres::GenericClient) -> Result<PgLsn> {
-    generate_internal(client, |client, _| {
+pub struct Simple;
+impl Crafter for Simple {
+    const NAME: &'static str = "simple";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+        craft_internal(client, |client, _| {
+            client.execute("CREATE table t(x int)", &[])?;
+            Ok((Vec::new(), None))
+        })
+    }
+}
+
+pub struct LastWalRecordXlogSwitch;
+impl Crafter for LastWalRecordXlogSwitch {
+    const NAME: &'static str = "last_wal_record_xlog_switch";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+        // Do not use craft_internal because here we end up with flush_lsn exactly on
+        // the segment boundary and insert_lsn after the initial page header, which is unusual.
+        ensure_server_config(client)?;
+
        client.execute("CREATE table t(x int)", &[])?;
-        Ok(None)
-    })
+        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
+        ensure!(
+            after_xlog_switch <= next_segment,
+            "XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
+            after_xlog_switch,
+            next_segment
+        );
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
+    }
 }

-fn generate_single_logical_message(
+pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
+impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
+    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+        // Do not use craft_internal because here we end up with flush_lsn exactly on
+        // the segment boundary and insert_lsn after the initial page header, which is unusual.
+        ensure_server_config(client)?;
+
+        client.execute("CREATE table t(x int)", &[])?;
+
+        // Add padding so the XLOG_SWITCH record ends exactly on an XLOG_BLCKSZ boundary.
+        // We will use a logical message as the padding. We start by detecting how much WAL
+        // one logical message takes, considering all alignments and headers.
+        let base_wal_advance = {
+            let before_lsn = client.pg_current_wal_insert_lsn()?;
+            // A small non-empty message bigger than a few bytes is more likely than an empty
+            // message to have the same format as the big padding message.
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
+                &[],
+            )?;
+            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+                + XLOG_SIZE_OF_XLOG_RECORD
+        };
+        let mut remaining_lsn =
+            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
+        if remaining_lsn < base_wal_advance {
+            remaining_lsn += XLOG_BLCKSZ;
+        }
+        let repeats = 10 + remaining_lsn - base_wal_advance;
+        info!(
+            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
+            client.pg_current_wal_insert_lsn()?,
+            remaining_lsn,
+            base_wal_advance,
+            repeats
+        );
+        client.execute(
+            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+            &[&(repeats as i32)],
+        )?;
+        info!(
+            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
+            client.pg_current_wal_insert_lsn()?,
+            XLOG_SIZE_OF_XLOG_RECORD
+        );
+
+        // Emit the XLOG_SWITCH
+        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let next_segment = PgLsn::from(0x0200_0000);
+        ensure!(
+            after_xlog_switch < next_segment,
+            "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
+            after_xlog_switch,
+            next_segment
+        );
+        ensure!(
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            "XLOG_SWITCH message ended not on page boundary: {}",
+            after_xlog_switch
+        );
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
+    }
+}
+
+fn craft_single_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> Result<PgLsn> {
-    generate_internal(client, |client, initial_lsn| {
+) -> Result<(Vec<PgLsn>, PgLsn)> {
+    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
            "Initial LSN is too far in the future"
@@ -258,21 +400,25 @@ fn generate_single_logical_message(
                message_lsn < after_message_lsn,
                "No record found after the emitted message"
            );
-            Ok(Some(after_message_lsn))
+            Ok((vec![message_lsn], Some(after_message_lsn)))
        } else {
-            Ok(Some(message_lsn))
+            Ok((Vec::new(), Some(message_lsn)))
        }
    })
 }

-pub fn generate_wal_record_crossing_segment_followed_by_small_one(
-    client: &mut impl postgres::GenericClient,
-) -> Result<PgLsn> {
-    generate_single_logical_message(client, true)
+pub struct WalRecordCrossingSegmentFollowedBySmallOne;
+impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
+    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+        craft_single_logical_message(client, true)
+    }
 }

-pub fn generate_last_wal_record_crossing_segment<C: postgres::GenericClient>(
-    client: &mut C,
-) -> Result<PgLsn> {
-    generate_single_logical_message(client, false)
+pub struct LastWalRecordCrossingSegment;
+impl Crafter for LastWalRecordCrossingSegment {
+    const NAME: &'static str = "last_wal_record_crossing_segment";
+    fn craft(client: &mut impl postgres::GenericClient) -> Result<(Vec<PgLsn>, PgLsn)> {
+        craft_single_logical_message(client, false)
+    }
 }
--- a/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs
+++ b/libs/postgres_ffi/wal_generate/src/bin/wal_generate.rs
@@ -1,58 +0,0 @@
-use anyhow::*;
-use clap::{App, Arg};
-use wal_generate::*;
-
-fn main() -> Result<()> {
-    env_logger::Builder::from_env(
-        env_logger::Env::default().default_filter_or("wal_generate=info"),
-    )
-    .init();
-    let arg_matches = App::new("Postgres WAL generator")
-        .about("Generates Postgres databases with specific WAL properties")
-        .arg(
-            Arg::new("datadir")
-                .short('D')
-                .long("datadir")
-                .takes_value(true)
-                .help("Data directory for the Postgres server")
-                .required(true)
-        )
-        .arg(
-            Arg::new("pg-distrib-dir")
-                .long("pg-distrib-dir")
-                .takes_value(true)
-                .help("Directory with Postgres distribution (bin and lib directories, e.g. tmp_install)")
-                .default_value("/usr/local")
-        )
-        .arg(
-            Arg::new("type")
-                .long("type")
-                .takes_value(true)
-                .help("Type of WAL to generate")
-                .possible_values(["simple", "last_wal_record_crossing_segment", "wal_record_crossing_segment_followed_by_small_one"])
-                .required(true)
-        )
-        .get_matches();
-
-    let cfg = Conf {
-        pg_distrib_dir: arg_matches.value_of("pg-distrib-dir").unwrap().into(),
-        datadir: arg_matches.value_of("datadir").unwrap().into(),
-    };
-    cfg.initdb()?;
-    let mut srv = cfg.start_server()?;
-    let lsn = match arg_matches.value_of("type").unwrap() {
-        "simple" => generate_simple(&mut srv.connect_with_timeout()?)?,
-        "last_wal_record_crossing_segment" => {
-            generate_last_wal_record_crossing_segment(&mut srv.connect_with_timeout()?)?
-        }
-        "wal_record_crossing_segment_followed_by_small_one" => {
-            generate_wal_record_crossing_segment_followed_by_small_one(
-                &mut srv.connect_with_timeout()?,
-            )?
-        }
-        a => panic!("Unknown --type argument: {}", a),
-    };
-    println!("end_of_wal = {}", lsn);
-    srv.kill();
-    Ok(())
-}
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -7,7 +7,7 @@ edition = "2021"
 anyhow = { version = "1.0", features = ["backtrace"] }
 async-trait = "0.1"
 metrics = { version = "0.1", path = "../metrics" }
-once_cell = "1.8.0"
+once_cell = "1.13.0"
 rusoto_core = "0.48"
 rusoto_s3 = "0.48"
 serde = { version = "1.0", features = ["derive"] }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -12,8 +12,10 @@ use std::{
    borrow::Cow,
    collections::HashMap,
    ffi::OsStr,
+    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    path::{Path, PathBuf},
+    pin::Pin,
 };

 use anyhow::{bail, Context};
@@ -40,13 +42,19 @@ pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;

+pub trait RemoteObjectName {
+    // Needed to retrieve last component for RemoteObjectId.
+    // In other words a file name
+    fn object_name(&self) -> Option<&str>;
+}
+
 /// Storage (potentially remote) API to manage its state.
 /// This storage tries to be unaware of any layered repository context,
 /// providing basic CRUD operations for storage files.
 #[async_trait::async_trait]
 pub trait RemoteStorage: Send + Sync {
    /// A way to uniquely reference a file in the remote storage.
-    type RemoteObjectId;
+    type RemoteObjectId: RemoteObjectName;

    /// Attempts to derive the storage path out of the local path, if the latter is correct.
    fn remote_object_id(&self, local_path: &Path) -> anyhow::Result<Self::RemoteObjectId>;
@@ -57,6 +65,15 @@ pub trait RemoteStorage: Send + Sync {
    /// Lists all items the storage has right now.
    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>>;

+    /// Lists all top level subdirectories for a given prefix.
+    /// Note: here we assume that if the prefix is passed, it was obtained via remote_object_id,
+    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS),
+    /// so this method doesn't need to.
+    async fn list_prefixes(
+        &self,
+        prefix: Option<Self::RemoteObjectId>,
+    ) -> anyhow::Result<Vec<Self::RemoteObjectId>>;
+
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
@@ -70,11 +87,7 @@ pub trait RemoteStorage: Send + Sync {

    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
    /// Returns the metadata, if any was stored with the file previously.
-    async fn download(
-        &self,
-        from: &Self::RemoteObjectId,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
+    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError>;

    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
    /// Returns the metadata, if any was stored with the file previously.
@@ -83,12 +96,49 @@ pub trait RemoteStorage: Send + Sync {
        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>>;
+    ) -> Result<Download, DownloadError>;

    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()>;
 }

+pub struct Download {
+    pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send>>,
+    /// Extra key-value data, associated with the current remote file.
+    pub metadata: Option<StorageMetadata>,
+}
+
+impl Debug for Download {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Download")
+            .field("metadata", &self.metadata)
+            .finish()
+    }
+}
+
+#[derive(Debug)]
+pub enum DownloadError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The file was not found in the remote storage.
+    NotFound,
+    /// The file was found in the remote storage, but the download failed.
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for DownloadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DownloadError::BadInput(e) => {
+                write!(f, "Failed to download a remote file due to user input: {e}")
+            }
+            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
+            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e}"),
+        }
+    }
+}
+
+impl std::error::Error for DownloadError {}
+
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 pub enum GenericRemoteStorage {
@@ -180,7 +230,7 @@ pub struct S3Config {
    pub concurrency_limit: NonZeroUsize,
 }

-impl std::fmt::Debug for S3Config {
+impl Debug for S3Config {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("S3Config")
            .field("bucket_name", &self.bucket_name)
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,6 +5,7 @@
 //! volume is mounted to the local FS.

 use std::{
+    borrow::Cow,
    future::Future,
    path::{Path, PathBuf},
    pin::Pin,
@@ -17,10 +18,16 @@ use tokio::{
 };
 use tracing::*;

-use crate::path_with_suffix_extension;
+use crate::{path_with_suffix_extension, Download, DownloadError, RemoteObjectName};

 use super::{strip_path_prefix, RemoteStorage, StorageMetadata};

+impl RemoteObjectName for PathBuf {
+    fn object_name(&self) -> Option<&str> {
+        self.file_stem().and_then(|n| n.to_str())
+    }
+}
+
 pub struct LocalFs {
    working_directory: PathBuf,
    storage_root: PathBuf,
@@ -101,7 +108,18 @@ impl RemoteStorage for LocalFs {
    }

    async fn list(&self) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
-        get_all_files(&self.storage_root).await
+        get_all_files(&self.storage_root, true).await
+    }
+
+    async fn list_prefixes(
+        &self,
+        prefix: Option<Self::RemoteObjectId>,
+    ) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        let path = match prefix {
+            Some(prefix) => Cow::Owned(prefix),
+            None => Cow::Borrowed(&self.storage_root),
+        };
+        get_all_files(path.as_ref(), false).await
    }

    async fn upload(
@@ -192,15 +210,12 @@ impl RemoteStorage for LocalFs {
        Ok(())
    }

-    async fn download(
-        &self,
-        from: &Self::RemoteObjectId,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>> {
-        let file_path = self.resolve_in_storage(from)?;
-
-        if file_path.exists() && file_path.is_file() {
-            let mut source = io::BufReader::new(
+    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
+        let file_path = self
+            .resolve_in_storage(from)
+            .map_err(DownloadError::BadInput)?;
+        if file_exists(&file_path).map_err(DownloadError::BadInput)? {
+            let source = io::BufReader::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&file_path)
@@ -210,22 +225,20 @@ impl RemoteStorage for LocalFs {
                            "Failed to open source file '{}' to use in the download",
                            file_path.display()
                        )
-                    })?,
+                    })
+                    .map_err(DownloadError::Other)?,
            );
-            io::copy(&mut source, to).await.with_context(|| {
-                format!(
-                    "Failed to download file '{}' from the local storage",
-                    file_path.display()
-                )
-            })?;
-            source.flush().await?;

-            self.read_storage_metadata(&file_path).await
+            let metadata = self
+                .read_storage_metadata(&file_path)
+                .await
+                .map_err(DownloadError::Other)?;
+            Ok(Download {
+                metadata,
+                download_stream: Box::pin(source),
+            })
        } else {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
+            Err(DownloadError::NotFound)
        }
    }

@@ -234,22 +247,19 @@ impl RemoteStorage for LocalFs {
        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>> {
+    ) -> Result<Download, DownloadError> {
        if let Some(end_exclusive) = end_exclusive {
-            ensure!(
-                end_exclusive > start_inclusive,
-                "Invalid range, start ({}) is bigger then end ({:?})",
-                start_inclusive,
-                end_exclusive
-            );
+            if end_exclusive <= start_inclusive {
+                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})")));
+            };
            if start_inclusive == end_exclusive.saturating_sub(1) {
-                return Ok(None);
+                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
            }
        }
-        let file_path = self.resolve_in_storage(from)?;
-
-        if file_path.exists() && file_path.is_file() {
+        let file_path = self
+            .resolve_in_storage(from)
+            .map_err(DownloadError::BadInput)?;
+        if file_exists(&file_path).map_err(DownloadError::BadInput)? {
            let mut source = io::BufReader::new(
                fs::OpenOptions::new()
                    .read(true)
@@ -260,31 +270,31 @@ impl RemoteStorage for LocalFs {
                            "Failed to open source file '{}' to use in the download",
                            file_path.display()
                        )
-                    })?,
+                    })
+                    .map_err(DownloadError::Other)?,
            );
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
-                .context("Failed to seek to the range start in a local storage file")?;
-            match end_exclusive {
-                Some(end_exclusive) => {
-                    io::copy(&mut source.take(end_exclusive - start_inclusive), to).await
-                }
-                None => io::copy(&mut source, to).await,
-            }
-            .with_context(|| {
-                format!(
-                    "Failed to download file '{}' range from the local storage",
-                    file_path.display()
-                )
-            })?;
+                .context("Failed to seek to the range start in a local storage file")
+                .map_err(DownloadError::Other)?;
+            let metadata = self
+                .read_storage_metadata(&file_path)
+                .await
+                .map_err(DownloadError::Other)?;

-            self.read_storage_metadata(&file_path).await
+            Ok(match end_exclusive {
+                Some(end_exclusive) => Download {
+                    metadata,
+                    download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                },
+                None => Download {
+                    metadata,
+                    download_stream: Box::pin(source),
+                },
+            })
        } else {
-            bail!(
-                "File '{}' either does not exist or is not a file",
-                file_path.display()
-            )
+            Err(DownloadError::NotFound)
        }
    }

@@ -307,6 +317,7 @@ fn storage_metadata_path(original_path: &Path) -> PathBuf {

 fn get_all_files<'a, P>(
    directory_path: P,
+    recursive: bool,
 ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
 where
    P: AsRef<Path> + Send + Sync + 'a,
@@ -323,7 +334,11 @@ where
                    if file_type.is_symlink() {
                        debug!("{:?} us a symlink, skipping", entry_path)
                    } else if file_type.is_dir() {
-                        paths.extend(get_all_files(entry_path).await?.into_iter())
+                        if recursive {
+                            paths.extend(get_all_files(entry_path, true).await?.into_iter())
+                        } else {
+                            paths.push(dir_entry.path())
+                        }
                    } else {
                        paths.push(dir_entry.path());
                    }
@@ -352,6 +367,19 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
    Ok(())
 }

+/// Reports whether `file_path` exists; an existing path that is not a file is an error.
+fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
+    if !file_path.exists() {
+        return Ok(false);
+    }
+    ensure!(
+        file_path.is_file(),
+        "file path '{}' is not a file",
+        file_path.display()
+    );
+    Ok(true)
+}
+
 #[cfg(test)]
 mod pure_tests {
    use tempfile::tempdir;
@@ -518,6 +546,31 @@ mod fs_tests {
    use std::{collections::HashMap, io::Write};
    use tempfile::tempdir;

+    async fn read_and_assert_remote_file_contents(
+        storage: &LocalFs,
+        #[allow(clippy::ptr_arg)]
+        // have to use &PathBuf due to `storage.local_path` parameter requirements
+        remote_storage_path: &PathBuf,
+        expected_metadata: Option<&StorageMetadata>,
+    ) -> anyhow::Result<String> {
+        let mut download = storage
+            .download(remote_storage_path)
+            .await
+            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
+        ensure!(
+            download.metadata.as_ref() == expected_metadata,
+            "Unexpected metadata returned for the downloaded file"
+        );
+
+        let mut contents = String::new();
+        download
+            .download_stream
+            .read_to_string(&mut contents)
+            .await
+            .context("Failed to read remote file contents into string")?;
+        Ok(contents)
+    }
+
    #[tokio::test]
    async fn upload_file() -> anyhow::Result<()> {
        let workdir = tempdir()?.path().to_owned();
@@ -568,15 +621,7 @@ mod fs_tests {
        let upload_name = "upload_1";
        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

-        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage.download(&upload_target, &mut content_bytes).await?;
-        assert!(
-            metadata.is_none(),
-            "No metadata should be returned for no metadata upload"
-        );
-
-        content_bytes.flush().await?;
-        let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
+        let contents = read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
        assert_eq!(
            dummy_contents(upload_name),
            contents,
@@ -584,13 +629,9 @@ mod fs_tests {
        );

        let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage.download(&non_existing_path, &mut io::sink()).await {
-            Ok(_) => panic!("Should not allow downloading non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&non_existing_path.display().to_string()));
-            }
+        match storage.download(&non_existing_path).await {
+            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
+            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
        Ok(())
    }
@@ -603,58 +644,31 @@ mod fs_tests {
        let upload_name = "upload_1";
        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

-        let mut full_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage
-            .download_byte_range(&upload_target, 0, None, &mut full_range_bytes)
-            .await?;
-        assert!(
-            metadata.is_none(),
-            "No metadata should be returned for no metadata upload"
-        );
-        full_range_bytes.flush().await?;
+        let full_range_download_contents =
+            read_and_assert_remote_file_contents(&storage, &upload_target, None).await?;
        assert_eq!(
            dummy_contents(upload_name),
-            String::from_utf8(full_range_bytes.into_inner().into_inner())?,
+            full_range_download_contents,
            "Download full range should return the whole upload"
        );

-        let mut zero_range_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let same_byte = 1_000_000_000;
-        let metadata = storage
-            .download_byte_range(
-                &upload_target,
-                same_byte,
-                Some(same_byte + 1), // exclusive end
-                &mut zero_range_bytes,
-            )
-            .await?;
-        assert!(
-            metadata.is_none(),
-            "No metadata should be returned for no metadata upload"
-        );
-        zero_range_bytes.flush().await?;
-        assert!(
-            zero_range_bytes.into_inner().into_inner().is_empty(),
-            "Zero byte range should not download any part of the file"
-        );
-
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage
-            .download_byte_range(
-                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
-                &mut first_part_remote,
-            )
+        let mut first_part_download = storage
+            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
-            metadata.is_none(),
+            first_part_download.metadata.is_none(),
            "No metadata should be returned for no metadata upload"
        );

+        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut first_part_download.download_stream,
+            &mut first_part_remote,
+        )
+        .await?;
        first_part_remote.flush().await?;
        let first_part_remote = first_part_remote.into_inner().into_inner();
        assert_eq!(
@@ -663,20 +677,24 @@ mod fs_tests {
            "First part bytes should be returned when requested"
        );

-        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let metadata = storage
+        let mut second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
                Some((first_part_local.len() + second_part_local.len()) as u64),
-                &mut second_part_remote,
            )
            .await?;
        assert!(
-            metadata.is_none(),
+            second_part_download.metadata.is_none(),
            "No metadata should be returned for no metadata upload"
        );

+        let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut second_part_download.download_stream,
+            &mut second_part_remote,
+        )
+        .await?;
        second_part_remote.flush().await?;
        let second_part_remote = second_part_remote.into_inner().into_inner();
        assert_eq!(
@@ -696,11 +714,30 @@ mod fs_tests {
        let upload_name = "upload_1";
        let upload_target = upload_dummy_file(&workdir, &storage, upload_name, None).await?;

+        let start = 1_000_000_000;
+        let end = start + 1;
+        match storage
+            .download_byte_range(
+                &upload_target,
+                start,
+                Some(end), // exclusive end
+            )
+            .await
+        {
+            Ok(_) => panic!("Should not allow downloading wrong ranges"),
+            Err(e) => {
+                let error_string = e.to_string();
+                assert!(error_string.contains("zero bytes"));
+                assert!(error_string.contains(&start.to_string()));
+                assert!(error_string.contains(&end.to_string()));
+            }
+        }
+
        let start = 10000;
        let end = 234;
        assert!(start > end, "Should test an incorrect range");
        match storage
-            .download_byte_range(&upload_target, start, Some(end), &mut io::sink())
+            .download_byte_range(&upload_target, start, Some(end))
            .await
        {
            Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -712,18 +749,6 @@ mod fs_tests {
            }
        }

-        let non_existing_path = PathBuf::from("somewhere").join("else");
-        match storage
-            .download_byte_range(&non_existing_path, 1, Some(3), &mut io::sink())
-            .await
-        {
-            Ok(_) => panic!("Should not allow downloading non-existing storage file ranges"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                assert!(error_string.contains(&non_existing_path.display().to_string()));
-            }
-        }
        Ok(())
    }

@@ -762,35 +787,26 @@ mod fs_tests {
        let upload_target =
            upload_dummy_file(&workdir, &storage, upload_name, Some(metadata.clone())).await?;

-        let mut content_bytes = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let full_download_metadata = storage.download(&upload_target, &mut content_bytes).await?;
-
-        content_bytes.flush().await?;
-        let contents = String::from_utf8(content_bytes.into_inner().into_inner())?;
+        let full_range_download_contents =
+            read_and_assert_remote_file_contents(&storage, &upload_target, Some(&metadata)).await?;
        assert_eq!(
            dummy_contents(upload_name),
-            contents,
+            full_range_download_contents,
            "We should upload and download the same contents"
        );

-        assert_eq!(
-            full_download_metadata.as_ref(),
-            Some(&metadata),
-            "We should get the same metadata back for full download"
-        );
-
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-        let partial_download_metadata = storage
-            .download_byte_range(
-                &upload_target,
-                0,
-                Some(first_part_local.len() as u64),
-                &mut first_part_remote,
-            )
+        let mut partial_download_with_metadata = storage
+            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
+        let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
+        io::copy(
+            &mut partial_download_with_metadata.download_stream,
+            &mut first_part_remote,
+        )
+        .await?;
        first_part_remote.flush().await?;
        let first_part_remote = first_part_remote.into_inner().into_inner();
        assert_eq!(
@@ -800,8 +816,8 @@ mod fs_tests {
        );

        assert_eq!(
-            partial_download_metadata.as_ref(),
-            Some(&metadata),
+            partial_download_with_metadata.metadata,
+            Some(metadata),
            "We should get the same metadata back for partial download"
        );

@@ -843,7 +859,7 @@ mod fs_tests {
    }

    fn dummy_contents(name: &str) -> String {
-        format!("contents for {}", name)
+        format!("contents for {name}")
    }

    async fn list_files_sorted(storage: &LocalFs) -> anyhow::Result<Vec<PathBuf>> {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -9,17 +9,19 @@ use std::path::{Path, PathBuf};
 use anyhow::Context;
 use rusoto_core::{
    credential::{InstanceMetadataProvider, StaticProvider},
-    HttpClient, Region,
+    HttpClient, Region, RusotoError,
 };
 use rusoto_s3::{
-    DeleteObjectRequest, GetObjectRequest, ListObjectsV2Request, PutObjectRequest, S3Client,
-    StreamingBody, S3,
+    DeleteObjectRequest, GetObjectError, GetObjectRequest, ListObjectsV2Request, PutObjectRequest,
+    S3Client, StreamingBody, S3,
 };
 use tokio::{io, sync::Semaphore};
 use tokio_util::io::ReaderStream;
 use tracing::debug;

-use crate::{strip_path_prefix, RemoteStorage, S3Config};
+use crate::{
+    strip_path_prefix, Download, DownloadError, RemoteObjectName, RemoteStorage, S3Config,
+};

 use super::StorageMetadata;

@@ -117,6 +119,25 @@ impl S3ObjectKey {
    }
 }

+impl RemoteObjectName for S3ObjectKey {
+    /// Turn a/b/c or a/b/c/ into c; a key without any separator yields `None`
+    fn object_name(&self) -> Option<&str> {
+        // corner case: char::to_string is not const, that's why this is more verbose than it needs to be
+        // see https://github.com/rust-lang/rust/issues/88674
+        if self.0.len() == 1 && self.0.chars().next().unwrap() == S3_PREFIX_SEPARATOR {
+            return None;
+        }
+        // a trailing separator makes the first `rsplit` item empty, so the name is the second item
+        if self.0.ends_with(S3_PREFIX_SEPARATOR) {
+            self.0.rsplit(S3_PREFIX_SEPARATOR).nth(1)
+        } else {
+            self.0
+                .rsplit_once(S3_PREFIX_SEPARATOR)
+                .map(|(_, last)| last)
+        }
+    }
+}
+
 /// AWS S3 storage.
 pub struct S3Bucket {
    workdir: PathBuf,
@@ -150,17 +171,25 @@ impl S3Bucket {

        let access_key_id = std::env::var("AWS_ACCESS_KEY_ID").ok();
        let secret_access_key = std::env::var("AWS_SECRET_ACCESS_KEY").ok();
+        // session token is used when authorizing through sso
+        // which is typically the case when testing locally on developer machine
+        let session_token = std::env::var("AWS_SESSION_TOKEN").ok();

        let client = if access_key_id.is_none() && secret_access_key.is_none() {
            debug!("Using IAM-based AWS access");
            S3Client::new_with(request_dispatcher, InstanceMetadataProvider::new(), region)
        } else {
-            debug!("Using credentials-based AWS access");
+            debug!(
+                "Using credentials-based AWS access. Session token is set: {}",
+                session_token.is_some()
+            );
            S3Client::new_with(
                request_dispatcher,
-                StaticProvider::new_minimal(
+                StaticProvider::new(
                    access_key_id.unwrap_or_default(),
                    secret_access_key.unwrap_or_default(),
+                    session_token,
+                    None,
                ),
                region,
            )
@@ -187,6 +216,39 @@ impl S3Bucket {
            concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
        })
    }
+
+    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+        let _guard = self
+            .concurrency_limiter
+            .acquire()
+            .await
+            .context("Concurrency limiter semaphore got closed during S3 download")
+            .map_err(DownloadError::Other)?;
+        // NOTE(review): `_guard` is released when this fn returns, so reading `download_stream` is not limited — confirm intended
+        metrics::inc_get_object();
+        // every attempt is counted above; a `NoSuchKey` reply maps to `NotFound` without the failure counter
+        match self.client.get_object(request).await {
+            Ok(object_output) => match object_output.body {
+                None => {
+                    metrics::inc_get_object_fail();
+                    Err(DownloadError::Other(anyhow::anyhow!(
+                        "Got no body for the S3 object given"
+                    )))
+                }
+                Some(body) => Ok(Download {
+                    metadata: object_output.metadata.map(StorageMetadata),
+                    download_stream: Box::pin(io::BufReader::new(body.into_async_read())),
+                }),
+            },
+            Err(RusotoError::Service(GetObjectError::NoSuchKey(_))) => Err(DownloadError::NotFound),
+            Err(e) => {
+                metrics::inc_get_object_fail();
+                Err(DownloadError::Other(anyhow::anyhow!(
+                    "Failed to download S3 object: {e}"
+                )))
+            }
+        }
+    }
 }

 #[async_trait::async_trait]
@@ -250,6 +312,69 @@ impl RemoteStorage for S3Bucket {
        Ok(document_keys)
    }

+    /// See the doc for `RemoteStorage::list_prefixes`
+    /// Note: it won't include empty "directories"
+    async fn list_prefixes(
+        &self,
+        prefix: Option<Self::RemoteObjectId>,
+    ) -> anyhow::Result<Vec<Self::RemoteObjectId>> {
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let list_prefix = prefix
+            .map(|p| p.0)
+            .or_else(|| self.prefix_in_bucket.clone())
+            .map(|mut p| {
+                // required to end with a separator
+                // otherwise request will return only the entry of a prefix
+                if !p.ends_with(S3_PREFIX_SEPARATOR) {
+                    p.push(S3_PREFIX_SEPARATOR);
+                }
+                p
+            });
+
+        let mut document_keys = Vec::new();
+
+        let mut continuation_token = None;
+        loop {
+            let _guard = self
+                .concurrency_limiter
+                .acquire()
+                .await
+                .context("Concurrency limiter semaphore got closed during S3 list")?;
+
+            metrics::inc_list_objects();
+
+            let fetch_response = self
+                .client
+                .list_objects_v2(ListObjectsV2Request {
+                    bucket: self.bucket_name.clone(),
+                    prefix: list_prefix.clone(),
+                    continuation_token,
+                    delimiter: Some(S3_PREFIX_SEPARATOR.to_string()),
+                    ..ListObjectsV2Request::default()
+                })
+                .await
+                .map_err(|e| {
+                    metrics::inc_list_objects_fail();
+                    e
+                })?;
+
+            document_keys.extend(
+                fetch_response
+                    .common_prefixes
+                    .unwrap_or_default()
+                    .into_iter()
+                    .filter_map(|o| Some(S3ObjectKey(o.prefix?))),
+            );
+            // the next page is addressed by `next_continuation_token`; `continuation_token` only echoes the request
+            match fetch_response.next_continuation_token {
+                Some(new_token) => continuation_token = Some(new_token),
+                None => break,
+            }
+        }
+
+        Ok(document_keys)
+    }
+
    async fn upload(
        &self,
        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
@@ -283,38 +408,13 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

-    async fn download(
-        &self,
-        from: &Self::RemoteObjectId,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>> {
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 download")?;
-
-        metrics::inc_get_object();
-
-        let object_output = self
-            .client
-            .get_object(GetObjectRequest {
-                bucket: self.bucket_name.clone(),
-                key: from.key().to_owned(),
-                ..GetObjectRequest::default()
-            })
-            .await
-            .map_err(|e| {
-                metrics::inc_get_object_fail();
-                e
-            })?;
-
-        if let Some(body) = object_output.body {
-            let mut from = io::BufReader::new(body.into_async_read());
-            io::copy(&mut from, to).await?;
-        }
-
-        Ok(object_output.metadata.map(StorageMetadata))
+    async fn download(&self, from: &Self::RemoteObjectId) -> Result<Download, DownloadError> {
+        self.download_object(GetObjectRequest {
+            bucket: self.bucket_name.clone(),
+            key: from.key().to_owned(),
+            ..GetObjectRequest::default()
+        })
+        .await
    }

    async fn download_byte_range(
@@ -322,8 +422,7 @@ impl RemoteStorage for S3Bucket {
        from: &Self::RemoteObjectId,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
-        to: &mut (impl io::AsyncWrite + Unpin + Send + Sync),
-    ) -> anyhow::Result<Option<StorageMetadata>> {
+    ) -> Result<Download, DownloadError> {
        // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
        // and needs both ends to be exclusive
        let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1));
@@ -331,34 +430,14 @@ impl RemoteStorage for S3Bucket {
            Some(end_inclusive) => format!("bytes={}-{}", start_inclusive, end_inclusive),
            None => format!("bytes={}-", start_inclusive),
        });
-        let _guard = self
-            .concurrency_limiter
-            .acquire()
-            .await
-            .context("Concurrency limiter semaphore got closed during S3 range download")?;

-        metrics::inc_get_object();
-
-        let object_output = self
-            .client
-            .get_object(GetObjectRequest {
-                bucket: self.bucket_name.clone(),
-                key: from.key().to_owned(),
-                range,
-                ..GetObjectRequest::default()
-            })
-            .await
-            .map_err(|e| {
-                metrics::inc_get_object_fail();
-                e
-            })?;
-
-        if let Some(body) = object_output.body {
-            let mut from = io::BufReader::new(body.into_async_read());
-            io::copy(&mut from, to).await?;
-        }
-
-        Ok(object_output.metadata.map(StorageMetadata))
+        self.download_object(GetObjectRequest {
+            bucket: self.bucket_name.clone(),
+            key: from.key().to_owned(),
+            range,
+            ..GetObjectRequest::default()
+        })
+        .await
    }

    async fn delete(&self, path: &Self::RemoteObjectId) -> anyhow::Result<()> {
@@ -391,6 +470,25 @@ mod tests {

    use super::*;

+    #[test]
+    fn object_name() {
+        let k = S3ObjectKey("a/b/c".to_owned());
+        assert_eq!(k.object_name(), Some("c"));
+
+        let k = S3ObjectKey("a/b/c/".to_owned());
+        assert_eq!(k.object_name(), Some("c"));
+
+        let k = S3ObjectKey("a/".to_owned());
+        assert_eq!(k.object_name(), Some("a"));
+
+        // XXX is it impossible to have an empty key?
+        let k = S3ObjectKey("".to_owned());
+        assert_eq!(k.object_name(), None);
+
+        let k = S3ObjectKey("/".to_owned());
+        assert_eq!(k.object_name(), None);
+    }
+
    #[test]
    fn download_destination() -> anyhow::Result<()> {
        let workdir = tempdir()?.path().to_owned();
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -8,7 +8,6 @@ anyhow = "1.0"
 bincode = "1.3"
 bytes = "1.0.1"
 hyper = { version = "0.14.7", features = ["full"] }
-lazy_static = "1.4.0"
 pin-project-lite = "0.2.7"
 postgres = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
 postgres-protocol = { git = "https://github.com/zenithdb/rust-postgres.git", rev="d052ee8b86fff9897c77b0fe89ea9daba0e1fa38" }
@@ -28,6 +27,8 @@ rustls = "0.20.2"
 rustls-split = "0.3.0"
 git-version = "0.3.5"
 serde_with = "1.12.0"
+once_cell = "1.13.0"
+

 metrics = { path = "../metrics" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -4,8 +4,8 @@ use crate::zid::ZTenantId;
 use anyhow::anyhow;
 use hyper::header::AUTHORIZATION;
 use hyper::{header::CONTENT_TYPE, Body, Request, Response, Server};
-use lazy_static::lazy_static;
 use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder};
+use once_cell::sync::Lazy;
 use routerify::ext::RequestExt;
 use routerify::RequestInfo;
 use routerify::{Middleware, Router, RouterBuilder, RouterService};
@@ -16,13 +16,13 @@ use std::net::TcpListener;

 use super::error::ApiError;

-lazy_static! {
-    static ref SERVE_METRICS_COUNT: IntCounter = register_int_counter!(
+static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
        "libmetrics_metric_handler_requests_total",
        "Number of metric requests made"
    )
-    .expect("failed to define a metric");
-}
+    .expect("failed to define a metric")
+});

 async fn logger(res: Response<Body>, info: RequestInfo) -> Result<Response<Body>, ApiError> {
    info!("{} {} {}", info.method(), info.uri().path(), res.status(),);
--- a/libs/utils/src/postgres_backend.rs
+++ b/libs/utils/src/postgres_backend.rs
@@ -13,13 +13,10 @@ use std::fmt;
 use std::io::{self, Write};
 use std::net::{Shutdown, SocketAddr, TcpStream};
 use std::str::FromStr;
-use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::Arc;
 use std::time::Duration;
 use tracing::*;

-static PGBACKEND_SHUTDOWN_REQUESTED: AtomicBool = AtomicBool::new(false);
-
 pub trait Handler {
    /// Handle single query.
    /// postgres_backend will issue ReadyForQuery after calling this (this
@@ -45,6 +42,10 @@ pub trait Handler {
    fn check_auth_jwt(&mut self, _pgb: &mut PostgresBackend, _jwt_response: &[u8]) -> Result<()> {
        bail!("JWT auth failed")
    }
+
+    fn is_shutdown_requested(&self) -> bool {
+        false
+    }
 }

 /// PostgresBackend protocol state.
@@ -274,7 +275,7 @@ impl PostgresBackend {

        let mut unnamed_query_string = Bytes::new();

-        while !PGBACKEND_SHUTDOWN_REQUESTED.load(Ordering::Relaxed) {
+        while !handler.is_shutdown_requested() {
            match self.read_message() {
                Ok(message) => {
                    if let Some(msg) = message {
@@ -493,8 +494,3 @@ impl PostgresBackend {
        Ok(ProcessMsgResult::Continue)
    }
 }
-
-// Set the flag to inform connections to cancel
-pub fn set_pgbackend_shutdown_requested() {
-    PGBACKEND_SHUTDOWN_REQUESTED.swap(true, Ordering::Relaxed);
-}
--- a/libs/utils/src/pq_proto.rs
+++ b/libs/utils/src/pq_proto.rs
@@ -47,10 +47,12 @@ pub enum FeStartupPacket {
    StartupMessage {
        major_version: u32,
        minor_version: u32,
-        params: HashMap<String, String>,
+        params: StartupMessageParams,
    },
 }

+pub type StartupMessageParams = HashMap<String, String>;
+
 #[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)]
 pub struct CancelKeyData {
    pub backend_pid: i32,
@@ -269,15 +271,18 @@ impl FeStartupPacket {
                            .next()
                            .context("expected even number of params in StartupMessage")?;
                        if name == "options" {
-                            //parsing options arguments "..&options=<var>:<val>,.."
-                            //extended example and set of options:
-                            //https://github.com/neondatabase/neon/blob/main/docs/rfcs/016-connection-routing.md#connection-url
-                            for cmdopt in value.split(',') {
-                                let nameval: Vec<&str> = cmdopt.split(':').collect();
+                            // parsing options arguments "...&options=<var0>%3D<val0>+<var1>%3D<val1>..."
+                            // '%3D' is '=' and '+' is ' '
+
+                            // Note: users whose clients lack SNI capabilities may pass
+                            // a special keyword argument 'project', which the proxy
+                            // uses to determine the cluster name.
+
+                            // TODO: write a unit test for this and refactor it into its own function.
+                            for cmdopt in value.split(' ') {
+                                let nameval: Vec<&str> = cmdopt.split('=').collect();
                                if nameval.len() == 2 {
                                    params.insert(nameval[0].to_string(), nameval[1].to_string());
-                                } else {
-                                    //todo: inform user / throw error message if options format is wrong.
                                }
                            }
                        } else {
@@ -923,10 +928,10 @@ impl<'a> BeMessage<'a> {
    }
 }

-// Zenith extension of postgres replication protocol
-// See ZENITH_STATUS_UPDATE_TAG_BYTE
+// Neon extension of postgres replication protocol
+// See NEON_STATUS_UPDATE_TAG_BYTE
 #[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
-pub struct ZenithFeedback {
+pub struct ReplicationFeedback {
    // Last known size of the timeline. Used to enforce timeline size limit.
    pub current_timeline_size: u64,
    // Parts of StandbyStatusUpdate we resend to compute via safekeeper
@@ -936,13 +941,13 @@ pub struct ZenithFeedback {
    pub ps_replytime: SystemTime,
 }

-// NOTE: Do not forget to increment this number when adding new fields to ZenithFeedback.
+// NOTE: Do not forget to increment this number when adding new fields to ReplicationFeedback.
 // Do not remove previously available fields because this might be backwards incompatible.
-pub const ZENITH_FEEDBACK_FIELDS_NUMBER: u8 = 5;
+pub const REPLICATION_FEEDBACK_FIELDS_NUMBER: u8 = 5;

-impl ZenithFeedback {
-    pub fn empty() -> ZenithFeedback {
-        ZenithFeedback {
+impl ReplicationFeedback {
+    pub fn empty() -> ReplicationFeedback {
+        ReplicationFeedback {
            current_timeline_size: 0,
            ps_writelsn: 0,
            ps_applylsn: 0,
@@ -951,7 +956,7 @@ impl ZenithFeedback {
        }
    }

-    // Serialize ZenithFeedback using custom format
+    // Serialize ReplicationFeedback using custom format
    // to support protocol extensibility.
    //
    // Following layout is used:
@@ -962,7 +967,7 @@ impl ZenithFeedback {
    // uint32 - value length in bytes
    // value itself
    pub fn serialize(&self, buf: &mut BytesMut) -> Result<()> {
-        buf.put_u8(ZENITH_FEEDBACK_FIELDS_NUMBER); // # of keys
+        buf.put_u8(REPLICATION_FEEDBACK_FIELDS_NUMBER); // # of keys
        write_cstr(&Bytes::from("current_timeline_size"), buf)?;
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);
@@ -989,9 +994,9 @@ impl ZenithFeedback {
        Ok(())
    }

-    // Deserialize ZenithFeedback message
-    pub fn parse(mut buf: Bytes) -> ZenithFeedback {
-        let mut zf = ZenithFeedback::empty();
+    // Deserialize ReplicationFeedback message
+    pub fn parse(mut buf: Bytes) -> ReplicationFeedback {
+        let mut zf = ReplicationFeedback::empty();
        let nfields = buf.get_u8();
        let mut i = 0;
        while i < nfields {
@@ -1032,14 +1037,14 @@ impl ZenithFeedback {
                _ => {
                    let len = buf.get_i32();
                    warn!(
-                        "ZenithFeedback parse. unknown key {} of len {}. Skip it.",
+                        "ReplicationFeedback parse. unknown key {} of len {}. Skip it.",
                        key, len
                    );
                    buf.advance(len as usize);
                }
            }
        }
-        trace!("ZenithFeedback parsed is {:?}", zf);
+        trace!("ReplicationFeedback parsed is {:?}", zf);
        zf
    }
 }
@@ -1049,8 +1054,8 @@ mod tests {
    use super::*;

    #[test]
-    fn test_zenithfeedback_serialization() {
-        let mut zf = ZenithFeedback::empty();
+    fn test_replication_feedback_serialization() {
+        let mut zf = ReplicationFeedback::empty();
        // Fill zf with some values
        zf.current_timeline_size = 12345678;
        // Set rounded time to be able to compare it with deserialized value,
@@ -1059,13 +1064,13 @@ mod tests {
        let mut data = BytesMut::new();
        zf.serialize(&mut data).unwrap();

-        let zf_parsed = ZenithFeedback::parse(data.freeze());
+        let zf_parsed = ReplicationFeedback::parse(data.freeze());
        assert_eq!(zf, zf_parsed);
    }

    #[test]
-    fn test_zenithfeedback_unknown_key() {
-        let mut zf = ZenithFeedback::empty();
+    fn test_replication_feedback_unknown_key() {
+        let mut zf = ReplicationFeedback::empty();
        // Fill zf with some values
        zf.current_timeline_size = 12345678;
        // Set rounded time to be able to compare it with deserialized value,
@@ -1076,7 +1081,7 @@ mod tests {

        // Add an extra field to the buffer and adjust number of keys
        if let Some(first) = data.first_mut() {
-            *first = ZENITH_FEEDBACK_FIELDS_NUMBER + 1;
+            *first = REPLICATION_FEEDBACK_FIELDS_NUMBER + 1;
        }

        write_cstr(&Bytes::from("new_field_one"), &mut data).unwrap();
@@ -1084,7 +1089,7 @@ mod tests {
        data.put_u64(42);

        // Parse serialized data and check that new field is not parsed
-        let zf_parsed = ZenithFeedback::parse(data.freeze());
+        let zf_parsed = ReplicationFeedback::parse(data.freeze());
        assert_eq!(zf, zf_parsed);
    }

--- a/libs/utils/tests/ssl_test.rs
+++ b/libs/utils/tests/ssl_test.rs
@@ -7,7 +7,7 @@ use std::{

 use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use bytes::{Buf, BufMut, Bytes, BytesMut};
-use lazy_static::lazy_static;
+use once_cell::sync::Lazy;

 use utils::postgres_backend::{AuthType, Handler, PostgresBackend};

@@ -19,16 +19,15 @@ fn make_tcp_pair() -> (TcpStream, TcpStream) {
    (server_stream, client_stream)
 }

-lazy_static! {
-    static ref KEY: rustls::PrivateKey = {
-        let mut cursor = Cursor::new(include_bytes!("key.pem"));
-        rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
-    };
-    static ref CERT: rustls::Certificate = {
-        let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-        rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
-    };
-}
+static KEY: Lazy<rustls::PrivateKey> = Lazy::new(|| {
+    let mut cursor = Cursor::new(include_bytes!("key.pem"));
+    rustls::PrivateKey(rustls_pemfile::rsa_private_keys(&mut cursor).unwrap()[0].clone())
+});
+
+static CERT: Lazy<rustls::Certificate> = Lazy::new(|| {
+    let mut cursor = Cursor::new(include_bytes!("cert.pem"));
+    rustls::Certificate(rustls_pemfile::certs(&mut cursor).unwrap()[0].clone())
+});

 #[test]
 fn ssl() {
--- a/Show More
+++ b/Show More